{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 23583, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012721027859051011, "ewc_loss": 0.0, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 0.0, "grad_norm": 4.834890842437744, "learning_rate": 0.0, "loss": 0.7982, "mean_token_accuracy": 0.7762961387634277, "num_tokens": 38493.0, "step": 1 }, { "epoch": 0.00025442055718102023, "ewc_loss": 0.0, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 0.0, "grad_norm": 4.588568210601807, "learning_rate": 4.2390843577787196e-10, "loss": 0.8329, "mean_token_accuracy": 0.765798807144165, "num_tokens": 80419.0, "step": 2 }, { "epoch": 0.0003816308357715303, "ewc_loss": 1.709743457922741e-14, "ewc_loss_diag": 6.437450399132683e-19, "ewc_loss_parallel": 1.6479873021779667e-17, "grad_norm": 4.7255167961120605, "learning_rate": 8.478168715557439e-10, "loss": 0.7225, "mean_token_accuracy": 0.7960126996040344, "num_tokens": 118717.0, "step": 3 }, { "epoch": 0.0005088411143620405, "ewc_loss": 8.206768598029157e-13, "ewc_loss_diag": 5.941427905220564e-17, "ewc_loss_parallel": 7.598088824778415e-16, "grad_norm": 5.365298748016357, "learning_rate": 1.271725307333616e-09, "loss": 0.814, "mean_token_accuracy": 0.7710384130477905, "num_tokens": 150155.0, "step": 4 }, { "epoch": 0.0006360513929525506, "ewc_loss": 6.679101716144942e-12, "ewc_loss_diag": 1.5681900222830336e-15, "ewc_loss_parallel": 5.10702591327572e-15, "grad_norm": 4.3071088790893555, "learning_rate": 1.6956337431114878e-09, "loss": 0.7919, "mean_token_accuracy": 0.7744081020355225, "num_tokens": 193616.0, "step": 5 }, { "epoch": 0.0007632616715430606, "ewc_loss": 3.069544618483633e-11, "ewc_loss_diag": 5.6343818499726694e-15, "ewc_loss_parallel": 2.4980018054066022e-14, "grad_norm": 5.2172746658325195, "learning_rate": 2.1195421788893596e-09, "loss": 0.7894, "mean_token_accuracy": 0.7783465385437012, "num_tokens": 227640.0, "step": 6 }, { "epoch": 0.0008904719501335708, "ewc_loss": 6.002665031701326e-11, "ewc_loss_diag": 1.1102230246251565e-14, "ewc_loss_parallel": 4.907185768843192e-14, "grad_norm": 4.8332953453063965, "learning_rate": 2.543450614667232e-09, "loss": 0.8161, "mean_token_accuracy": 0.7747888565063477, "num_tokens": 265114.0, "step": 7 }, { "epoch": 0.001017682228724081, "ewc_loss": 2.546585164964199e-10, "ewc_loss_diag": 3.730349362740526e-14, "ewc_loss_parallel": 2.1671553440683056e-13, "grad_norm": 4.9345293045043945, "learning_rate": 2.967359050445104e-09, "loss": 0.7584, "mean_token_accuracy": 0.7888422012329102, "num_tokens": 299865.0, "step": 8 }, { "epoch": 0.001144892507314591, "ewc_loss": 3.7834979593753815e-10, "ewc_loss_diag": 5.617728504603292e-14, "ewc_loss_parallel": 3.2152058793144533e-13, "grad_norm": 4.575518608093262, "learning_rate": 3.3912674862229757e-09, "loss": 0.8139, "mean_token_accuracy": 0.7734696865081787, "num_tokens": 342063.0, "step": 9 }, { "epoch": 0.0012721027859051012, "ewc_loss": 7.385096978396177e-10, "ewc_loss_diag": 1.4299672557172016e-13, "ewc_loss_parallel": 5.968558980384842e-13, "grad_norm": 5.447710990905762, "learning_rate": 3.815175922000847e-09, "loss": 0.864, "mean_token_accuracy": 0.7640088796615601, "num_tokens": 374864.0, "step": 10 }, { "epoch": 0.0013993130644956112, "ewc_loss": 2.168235369026661e-09, "ewc_loss_diag": 2.4513724383723456e-13, "ewc_loss_parallel": 1.9184653865522705e-12, "grad_norm": 4.307745933532715, "learning_rate": 4.239084357778719e-09, "loss": 0.7765, "mean_token_accuracy": 0.775475263595581, "num_tokens": 416605.0, "step": 11 }, { "epoch": 0.0015265233430861213, "ewc_loss": 2.7794158086180687e-09, "ewc_loss_diag": 3.659295089164516e-13, "ewc_loss_parallel": 2.4158453015843406e-12, "grad_norm": 5.445792198181152, "learning_rate": 4.662992793556591e-09, "loss": 0.8343, "mean_token_accuracy": 0.7692118883132935, "num_tokens": 448798.0, "step": 12 }, { "epoch": 0.0016537336216766315, "ewc_loss": 3.4051481634378433e-09, "ewc_loss_diag": 4.298783551348606e-13, "ewc_loss_parallel": 2.9842794901924208e-12, "grad_norm": 5.453221321105957, "learning_rate": 5.086901229334464e-09, "loss": 0.8736, "mean_token_accuracy": 0.7548060417175293, "num_tokens": 480084.0, "step": 13 }, { "epoch": 0.0017809439002671415, "ewc_loss": 1.1117663234472275e-08, "ewc_loss_diag": 1.4566126083082054e-12, "ewc_loss_parallel": 9.663381206337363e-12, "grad_norm": 4.371667385101318, "learning_rate": 5.510809665112336e-09, "loss": 0.8056, "mean_token_accuracy": 0.7793096899986267, "num_tokens": 524543.0, "step": 14 }, { "epoch": 0.0019081541788576518, "ewc_loss": 1.6530975699424744e-08, "ewc_loss_diag": 1.9468870959826745e-12, "ewc_loss_parallel": 1.4551915228366852e-11, "grad_norm": 4.791067123413086, "learning_rate": 5.934718100890208e-09, "loss": 0.7516, "mean_token_accuracy": 0.7886589765548706, "num_tokens": 563314.0, "step": 15 }, { "epoch": 0.002035364457448162, "ewc_loss": 1.83936208486557e-08, "ewc_loss_diag": 2.1600499167107046e-12, "ewc_loss_parallel": 1.6257217794191092e-11, "grad_norm": 5.06108283996582, "learning_rate": 6.3586265366680796e-09, "loss": 0.8421, "mean_token_accuracy": 0.7626674175262451, "num_tokens": 598421.0, "step": 16 }, { "epoch": 0.002162574736038672, "ewc_loss": 2.2351741790771484e-08, "ewc_loss_diag": 2.7142732506035827e-12, "ewc_loss_parallel": 1.9667822925839573e-11, "grad_norm": 5.056787967681885, "learning_rate": 6.782534972445951e-09, "loss": 0.8318, "mean_token_accuracy": 0.7625830769538879, "num_tokens": 634690.0, "step": 17 }, { "epoch": 0.002289785014629182, "ewc_loss": 2.537854015827179e-08, "ewc_loss_diag": 3.169020601490047e-12, "ewc_loss_parallel": 2.2282620193436742e-11, "grad_norm": 5.124617576599121, "learning_rate": 7.206443408223823e-09, "loss": 0.869, "mean_token_accuracy": 0.7596092224121094, "num_tokens": 674653.0, "step": 18 }, { "epoch": 0.0024169952932196924, "ewc_loss": 7.35744833946228e-08, "ewc_loss_diag": 1.2221335055073723e-11, "ewc_loss_parallel": 6.139089236967266e-11, "grad_norm": 5.070525646209717, "learning_rate": 7.630351844001695e-09, "loss": 0.8007, "mean_token_accuracy": 0.7771714925765991, "num_tokens": 708238.0, "step": 19 }, { "epoch": 0.0025442055718102024, "ewc_loss": 1.0989606380462646e-07, "ewc_loss_diag": 1.432454155292362e-11, "ewc_loss_parallel": 9.549694368615746e-11, "grad_norm": 4.6204328536987305, "learning_rate": 8.054260279779567e-09, "loss": 0.7826, "mean_token_accuracy": 0.7787355780601501, "num_tokens": 749312.0, "step": 20 }, { "epoch": 0.0026714158504007124, "ewc_loss": 1.2386590242385864e-07, "ewc_loss_diag": 1.5916157281026244e-11, "ewc_loss_parallel": 1.07775122160092e-10, "grad_norm": 5.420673370361328, "learning_rate": 8.478168715557438e-09, "loss": 0.8523, "mean_token_accuracy": 0.7664451599121094, "num_tokens": 783532.0, "step": 21 }, { "epoch": 0.0027986261289912225, "ewc_loss": 1.3504177331924438e-07, "ewc_loss_diag": 1.7053025658242404e-11, "ewc_loss_parallel": 1.1823431123048067e-10, "grad_norm": 5.089050769805908, "learning_rate": 8.902077151335311e-09, "loss": 0.8302, "mean_token_accuracy": 0.7732916474342346, "num_tokens": 817429.0, "step": 22 }, { "epoch": 0.0029258364075817325, "ewc_loss": 1.4808028936386108e-07, "ewc_loss_diag": 2.0804691303055733e-11, "ewc_loss_parallel": 1.2732925824820995e-10, "grad_norm": 4.816972255706787, "learning_rate": 9.325985587113182e-09, "loss": 0.7638, "mean_token_accuracy": 0.7881088256835938, "num_tokens": 853964.0, "step": 23 }, { "epoch": 0.0030530466861722425, "ewc_loss": 1.6205012798309326e-07, "ewc_loss_diag": 2.2964741219766438e-11, "ewc_loss_parallel": 1.3915268937125802e-10, "grad_norm": 5.465129375457764, "learning_rate": 9.749894022891054e-09, "loss": 0.8276, "mean_token_accuracy": 0.7676252722740173, "num_tokens": 885070.0, "step": 24 }, { "epoch": 0.003180256964762753, "ewc_loss": 1.8067657947540283e-07, "ewc_loss_diag": 2.467004378559068e-11, "ewc_loss_parallel": 1.5552359400317073e-10, "grad_norm": 4.829807281494141, "learning_rate": 1.0173802458668929e-08, "loss": 0.88, "mean_token_accuracy": 0.7512437105178833, "num_tokens": 926893.0, "step": 25 }, { "epoch": 0.003307467243353263, "ewc_loss": 4.76837158203125e-07, "ewc_loss_diag": 8.321876521222293e-11, "ewc_loss_parallel": 3.92901711165905e-10, "grad_norm": 4.873714447021484, "learning_rate": 1.05977108944468e-08, "loss": 0.816, "mean_token_accuracy": 0.767375111579895, "num_tokens": 964773.0, "step": 26 }, { "epoch": 0.003434677521943773, "ewc_loss": 6.631016731262207e-07, "ewc_loss_diag": 1.0822986951097846e-10, "ewc_loss_parallel": 5.566107574850321e-10, "grad_norm": 4.744363784790039, "learning_rate": 1.1021619330224672e-08, "loss": 0.7692, "mean_token_accuracy": 0.7852367758750916, "num_tokens": 1002725.0, "step": 27 }, { "epoch": 0.003561887800534283, "ewc_loss": 7.934868335723877e-07, "ewc_loss_diag": 1.1550582712516189e-10, "ewc_loss_parallel": 6.766640581190586e-10, "grad_norm": 4.986883640289307, "learning_rate": 1.1445527766002543e-08, "loss": 0.8731, "mean_token_accuracy": 0.7573708891868591, "num_tokens": 1040296.0, "step": 28 }, { "epoch": 0.003689098079124793, "ewc_loss": 8.493661880493164e-07, "ewc_loss_diag": 1.255102688446641e-10, "ewc_loss_parallel": 7.239577826112509e-10, "grad_norm": 4.424383640289307, "learning_rate": 1.1869436201780416e-08, "loss": 0.7658, "mean_token_accuracy": 0.785591721534729, "num_tokens": 1081711.0, "step": 29 }, { "epoch": 0.0038163083577153036, "ewc_loss": 8.828938007354736e-07, "ewc_loss_diag": 1.3096723705530167e-10, "ewc_loss_parallel": 7.530616130679846e-10, "grad_norm": 4.810887336730957, "learning_rate": 1.2293344637558287e-08, "loss": 0.8121, "mean_token_accuracy": 0.7754683494567871, "num_tokens": 1120556.0, "step": 30 }, { "epoch": 0.003943518636305814, "ewc_loss": 9.387731552124023e-07, "ewc_loss_diag": 1.4733814168721437e-10, "ewc_loss_parallel": 7.930793799459934e-10, "grad_norm": 4.775334358215332, "learning_rate": 1.2717253073336159e-08, "loss": 0.7641, "mean_token_accuracy": 0.7900408506393433, "num_tokens": 1157723.0, "step": 31 }, { "epoch": 0.004070728914896324, "ewc_loss": 1.0281801223754883e-06, "ewc_loss_diag": 1.6370904631912708e-10, "ewc_loss_parallel": 8.62200977280736e-10, "grad_norm": 4.693875312805176, "learning_rate": 1.314116150911403e-08, "loss": 0.8102, "mean_token_accuracy": 0.7740100622177124, "num_tokens": 1197879.0, "step": 32 }, { "epoch": 0.004197939193486834, "ewc_loss": 1.0728836059570312e-06, "ewc_loss_diag": 1.7280399333685637e-10, "ewc_loss_parallel": 9.022187441587448e-10, "grad_norm": 4.80580472946167, "learning_rate": 1.3565069944891903e-08, "loss": 0.8103, "mean_token_accuracy": 0.7750473022460938, "num_tokens": 1237342.0, "step": 33 }, { "epoch": 0.004325149472077344, "ewc_loss": 1.169741153717041e-06, "ewc_loss_diag": 1.8189894035458565e-10, "ewc_loss_parallel": 9.89530235528946e-10, "grad_norm": 4.496280670166016, "learning_rate": 1.3988978380669775e-08, "loss": 0.782, "mean_token_accuracy": 0.7824699878692627, "num_tokens": 1280197.0, "step": 34 }, { "epoch": 0.004452359750667854, "ewc_loss": 1.519918441772461e-06, "ewc_loss_diag": 2.2646418074145913e-10, "ewc_loss_parallel": 1.2951204553246498e-09, "grad_norm": 4.656838417053223, "learning_rate": 1.4412886816447646e-08, "loss": 0.7642, "mean_token_accuracy": 0.785026490688324, "num_tokens": 1318625.0, "step": 35 }, { "epoch": 0.004579570029258364, "ewc_loss": 3.039836883544922e-06, "ewc_loss_diag": 6.039044819772243e-10, "ewc_loss_parallel": 2.4301698431372643e-09, "grad_norm": 4.813567161560059, "learning_rate": 1.4836795252225519e-08, "loss": 0.8219, "mean_token_accuracy": 0.7726645469665527, "num_tokens": 1356868.0, "step": 36 }, { "epoch": 0.004706780307848874, "ewc_loss": 4.082918167114258e-06, "ewc_loss_diag": 7.60337570682168e-10, "ewc_loss_parallel": 3.3178366720676422e-09, "grad_norm": 4.853723526000977, "learning_rate": 1.526070368800339e-08, "loss": 0.7936, "mean_token_accuracy": 0.780419111251831, "num_tokens": 1394696.0, "step": 37 }, { "epoch": 0.004833990586439385, "ewc_loss": 4.678964614868164e-06, "ewc_loss_diag": 8.403731044381857e-10, "ewc_loss_parallel": 3.841705620288849e-09, "grad_norm": 4.392285346984863, "learning_rate": 1.5684612123781262e-08, "loss": 0.7829, "mean_token_accuracy": 0.7820457816123962, "num_tokens": 1438738.0, "step": 38 }, { "epoch": 0.004961200865029895, "ewc_loss": 4.947185516357422e-06, "ewc_loss_diag": 9.022187441587448e-10, "ewc_loss_parallel": 4.045432433485985e-09, "grad_norm": 4.757291316986084, "learning_rate": 1.6108520559559135e-08, "loss": 0.7481, "mean_token_accuracy": 0.7923340797424316, "num_tokens": 1475089.0, "step": 39 }, { "epoch": 0.005088411143620405, "ewc_loss": 5.185604095458984e-06, "ewc_loss_diag": 9.313225746154785e-10, "ewc_loss_parallel": 4.249159246683121e-09, "grad_norm": 4.733593940734863, "learning_rate": 1.6532428995337004e-08, "loss": 0.8084, "mean_token_accuracy": 0.7748838663101196, "num_tokens": 1514566.0, "step": 40 }, { "epoch": 0.005215621422210915, "ewc_loss": 5.424022674560547e-06, "ewc_loss_diag": 9.677023626863956e-10, "ewc_loss_parallel": 4.48198989033699e-09, "grad_norm": 4.956955432891846, "learning_rate": 1.6956337431114877e-08, "loss": 0.8233, "mean_token_accuracy": 0.7735946774482727, "num_tokens": 1552560.0, "step": 41 }, { "epoch": 0.005342831700801425, "ewc_loss": 5.632638931274414e-06, "ewc_loss_diag": 1.0040821507573128e-09, "ewc_loss_parallel": 4.627509042620659e-09, "grad_norm": 5.399027347564697, "learning_rate": 1.738024586689275e-08, "loss": 0.8006, "mean_token_accuracy": 0.7768349647521973, "num_tokens": 1584759.0, "step": 42 }, { "epoch": 0.005470041979391935, "ewc_loss": 5.811452865600586e-06, "ewc_loss_diag": 1.0695657692849636e-09, "ewc_loss_parallel": 4.743924364447594e-09, "grad_norm": 4.847380638122559, "learning_rate": 1.7804154302670622e-08, "loss": 0.802, "mean_token_accuracy": 0.7756751775741577, "num_tokens": 1621825.0, "step": 43 }, { "epoch": 0.005597252257982445, "ewc_loss": 6.198883056640625e-06, "ewc_loss_diag": 1.178705133497715e-09, "ewc_loss_parallel": 5.034962669014931e-09, "grad_norm": 4.76943302154541, "learning_rate": 1.8228062738448494e-08, "loss": 0.8273, "mean_token_accuracy": 0.7675237059593201, "num_tokens": 1662946.0, "step": 44 }, { "epoch": 0.005724462536572955, "ewc_loss": 6.467103958129883e-06, "ewc_loss_diag": 1.2587406672537327e-09, "ewc_loss_parallel": 5.209585651755333e-09, "grad_norm": 4.678238868713379, "learning_rate": 1.8651971174226364e-08, "loss": 0.8153, "mean_token_accuracy": 0.7737489938735962, "num_tokens": 1699433.0, "step": 45 }, { "epoch": 0.005851672815163465, "ewc_loss": 6.854534149169922e-06, "ewc_loss_diag": 1.3096723705530167e-09, "ewc_loss_parallel": 5.529727786779404e-09, "grad_norm": 4.342339038848877, "learning_rate": 1.9075879610004236e-08, "loss": 0.7464, "mean_token_accuracy": 0.7884088754653931, "num_tokens": 1742812.0, "step": 46 }, { "epoch": 0.005978883093753975, "ewc_loss": 7.12275505065918e-06, "ewc_loss_diag": 1.3606040738523006e-09, "ewc_loss_parallel": 5.762558430433273e-09, "grad_norm": 4.870541095733643, "learning_rate": 1.949978804578211e-08, "loss": 0.8017, "mean_token_accuracy": 0.7786116003990173, "num_tokens": 1778725.0, "step": 47 }, { "epoch": 0.006106093372344485, "ewc_loss": 7.808208465576172e-06, "ewc_loss_diag": 1.4115357771515846e-09, "ewc_loss_parallel": 6.373738870024681e-09, "grad_norm": 4.824616432189941, "learning_rate": 1.9923696481559985e-08, "loss": 0.8479, "mean_token_accuracy": 0.7621228694915771, "num_tokens": 1816592.0, "step": 48 }, { "epoch": 0.006233303650934996, "ewc_loss": 9.894371032714844e-06, "ewc_loss_diag": 1.877197064459324e-09, "ewc_loss_parallel": 7.974449545145035e-09, "grad_norm": 4.430052757263184, "learning_rate": 2.0347604917337857e-08, "loss": 0.7399, "mean_token_accuracy": 0.7902880907058716, "num_tokens": 1859907.0, "step": 49 }, { "epoch": 0.006360513929525506, "ewc_loss": 1.5974044799804688e-05, "ewc_loss_diag": 3.6961864680051804e-09, "ewc_loss_parallel": 1.234002411365509e-08, "grad_norm": 4.723481178283691, "learning_rate": 2.0771513353115727e-08, "loss": 0.729, "mean_token_accuracy": 0.7998987436294556, "num_tokens": 1896627.0, "step": 50 }, { "epoch": 0.006487724208116016, "ewc_loss": 2.1338462829589844e-05, "ewc_loss_diag": 4.773028194904327e-09, "ewc_loss_parallel": 1.6530975699424744e-08, "grad_norm": 4.973272323608398, "learning_rate": 2.11954217888936e-08, "loss": 0.8446, "mean_token_accuracy": 0.7672643661499023, "num_tokens": 1934041.0, "step": 51 }, { "epoch": 0.006614934486706526, "ewc_loss": 2.4437904357910156e-05, "ewc_loss_diag": 5.209585651755333e-09, "ewc_loss_parallel": 1.9208528101444244e-08, "grad_norm": 4.568731784820557, "learning_rate": 2.1619330224671472e-08, "loss": 0.7738, "mean_token_accuracy": 0.7845152020454407, "num_tokens": 1976482.0, "step": 52 }, { "epoch": 0.006742144765297036, "ewc_loss": 2.7060508728027344e-05, "ewc_loss_diag": 5.558831617236137e-09, "ewc_loss_parallel": 2.1420419216156006e-08, "grad_norm": 5.285951614379883, "learning_rate": 2.2043238660449344e-08, "loss": 0.8032, "mean_token_accuracy": 0.7718637585639954, "num_tokens": 2009224.0, "step": 53 }, { "epoch": 0.006869355043887546, "ewc_loss": 2.9087066650390625e-05, "ewc_loss_diag": 5.762558430433273e-09, "ewc_loss_parallel": 2.3399479687213898e-08, "grad_norm": 4.8903303146362305, "learning_rate": 2.2467147096227214e-08, "loss": 0.8472, "mean_token_accuracy": 0.7620776891708374, "num_tokens": 2049235.0, "step": 54 }, { "epoch": 0.006996565322478056, "ewc_loss": 3.0517578125e-05, "ewc_loss_diag": 6.05359673500061e-09, "ewc_loss_parallel": 2.444721758365631e-08, "grad_norm": 4.599582195281982, "learning_rate": 2.2891055532005086e-08, "loss": 0.8604, "mean_token_accuracy": 0.7613122463226318, "num_tokens": 2090260.0, "step": 55 }, { "epoch": 0.007123775601068566, "ewc_loss": 3.0994415283203125e-05, "ewc_loss_diag": 6.257323548197746e-09, "ewc_loss_parallel": 2.4796463549137115e-08, "grad_norm": 5.000781536102295, "learning_rate": 2.331496396778296e-08, "loss": 0.8659, "mean_token_accuracy": 0.7606092691421509, "num_tokens": 2126686.0, "step": 56 }, { "epoch": 0.007250985879659076, "ewc_loss": 3.266334533691406e-05, "ewc_loss_diag": 6.461050361394882e-09, "ewc_loss_parallel": 2.60770320892334e-08, "grad_norm": 4.385902404785156, "learning_rate": 2.373887240356083e-08, "loss": 0.7677, "mean_token_accuracy": 0.7846575975418091, "num_tokens": 2171355.0, "step": 57 }, { "epoch": 0.007378196158249586, "ewc_loss": 3.2901763916015625e-05, "ewc_loss_diag": 6.577465683221817e-09, "ewc_loss_parallel": 2.6309862732887268e-08, "grad_norm": 4.596485137939453, "learning_rate": 2.4162780839338704e-08, "loss": 0.7409, "mean_token_accuracy": 0.7927523851394653, "num_tokens": 2211660.0, "step": 58 }, { "epoch": 0.007505406436840096, "ewc_loss": 3.2901763916015625e-05, "ewc_loss_diag": 6.7229848355054855e-09, "ewc_loss_parallel": 2.6193447411060333e-08, "grad_norm": 5.510993003845215, "learning_rate": 2.4586689275116573e-08, "loss": 0.8711, "mean_token_accuracy": 0.7588114142417908, "num_tokens": 2244411.0, "step": 59 }, { "epoch": 0.007632616715430607, "ewc_loss": 3.337860107421875e-05, "ewc_loss_diag": 7.0140231400728226e-09, "ewc_loss_parallel": 2.6426278054714203e-08, "grad_norm": 4.3686323165893555, "learning_rate": 2.5010597710894446e-08, "loss": 0.7977, "mean_token_accuracy": 0.770633339881897, "num_tokens": 2285253.0, "step": 60 }, { "epoch": 0.007759826994021117, "ewc_loss": 3.409385681152344e-05, "ewc_loss_diag": 7.392372936010361e-09, "ewc_loss_parallel": 2.6659108698368073e-08, "grad_norm": 4.500284671783447, "learning_rate": 2.5434506146672318e-08, "loss": 0.7622, "mean_token_accuracy": 0.7851353883743286, "num_tokens": 2328577.0, "step": 61 }, { "epoch": 0.007887037272611627, "ewc_loss": 3.457069396972656e-05, "ewc_loss_diag": 7.799826562404633e-09, "ewc_loss_parallel": 2.6659108698368073e-08, "grad_norm": 4.685747146606445, "learning_rate": 2.585841458245019e-08, "loss": 0.8422, "mean_token_accuracy": 0.7700719833374023, "num_tokens": 2366524.0, "step": 62 }, { "epoch": 0.008014247551202136, "ewc_loss": 3.528594970703125e-05, "ewc_loss_diag": 8.032657206058502e-09, "ewc_loss_parallel": 2.735760062932968e-08, "grad_norm": 4.224173069000244, "learning_rate": 2.628232301822806e-08, "loss": 0.7555, "mean_token_accuracy": 0.7898702025413513, "num_tokens": 2408628.0, "step": 63 }, { "epoch": 0.008141457829792647, "ewc_loss": 3.62396240234375e-05, "ewc_loss_diag": 8.32369551062584e-09, "ewc_loss_parallel": 2.782326191663742e-08, "grad_norm": 4.2982306480407715, "learning_rate": 2.6706231454005933e-08, "loss": 0.7952, "mean_token_accuracy": 0.7761989831924438, "num_tokens": 2451800.0, "step": 64 }, { "epoch": 0.008268668108383158, "ewc_loss": 3.719329833984375e-05, "ewc_loss_diag": 8.614733815193176e-09, "ewc_loss_parallel": 2.852175384759903e-08, "grad_norm": 5.658152103424072, "learning_rate": 2.7130139889783805e-08, "loss": 0.8663, "mean_token_accuracy": 0.7545405626296997, "num_tokens": 2481448.0, "step": 65 }, { "epoch": 0.008395878386973667, "ewc_loss": 3.886222839355469e-05, "ewc_loss_diag": 8.847564458847046e-09, "ewc_loss_parallel": 3.003515303134918e-08, "grad_norm": 4.2982707023620605, "learning_rate": 2.7554048325561678e-08, "loss": 0.7928, "mean_token_accuracy": 0.7754331827163696, "num_tokens": 2526339.0, "step": 66 }, { "epoch": 0.008523088665564178, "ewc_loss": 4.100799560546875e-05, "ewc_loss_diag": 9.138602763414383e-09, "ewc_loss_parallel": 3.189779818058014e-08, "grad_norm": 4.476857662200928, "learning_rate": 2.797795676133955e-08, "loss": 0.8179, "mean_token_accuracy": 0.7703796625137329, "num_tokens": 2570691.0, "step": 67 }, { "epoch": 0.008650298944154687, "ewc_loss": 4.673004150390625e-05, "ewc_loss_diag": 9.604264050722122e-09, "ewc_loss_parallel": 3.725290298461914e-08, "grad_norm": 4.634486675262451, "learning_rate": 2.840186519711742e-08, "loss": 0.8043, "mean_token_accuracy": 0.7751482725143433, "num_tokens": 2609207.0, "step": 68 }, { "epoch": 0.008777509222745198, "ewc_loss": 6.67572021484375e-05, "ewc_loss_diag": 1.391163095831871e-08, "ewc_loss_parallel": 5.2852556109428406e-08, "grad_norm": 4.993769645690918, "learning_rate": 2.8825773632895292e-08, "loss": 0.7927, "mean_token_accuracy": 0.7774702906608582, "num_tokens": 2645494.0, "step": 69 }, { "epoch": 0.008904719501335707, "ewc_loss": 9.775161743164062e-05, "ewc_loss_diag": 2.0605511963367462e-08, "ewc_loss_parallel": 7.729977369308472e-08, "grad_norm": 4.596004009246826, "learning_rate": 2.9249682068673165e-08, "loss": 0.7498, "mean_token_accuracy": 0.7915112972259521, "num_tokens": 2686258.0, "step": 70 }, { "epoch": 0.009031929779926218, "ewc_loss": 0.0001163482666015625, "ewc_loss_diag": 2.537854015827179e-08, "ewc_loss_parallel": 9.12696123123169e-08, "grad_norm": 5.109743595123291, "learning_rate": 2.9673590504451037e-08, "loss": 0.8045, "mean_token_accuracy": 0.7776104211807251, "num_tokens": 2720544.0, "step": 71 }, { "epoch": 0.009159140058516728, "ewc_loss": 0.00013065338134765625, "ewc_loss_diag": 2.8172507882118225e-08, "ewc_loss_parallel": 1.0244548320770264e-07, "grad_norm": 4.9124274253845215, "learning_rate": 3.0097498940228907e-08, "loss": 0.7676, "mean_token_accuracy": 0.7863144278526306, "num_tokens": 2758068.0, "step": 72 }, { "epoch": 0.009286350337107238, "ewc_loss": 0.0001392364501953125, "ewc_loss_diag": 3.003515303134918e-08, "ewc_loss_parallel": 1.0896474123001099e-07, "grad_norm": 5.090922832489014, "learning_rate": 3.052140737600678e-08, "loss": 0.8518, "mean_token_accuracy": 0.7629424333572388, "num_tokens": 2793342.0, "step": 73 }, { "epoch": 0.009413560615697748, "ewc_loss": 0.0001430511474609375, "ewc_loss_diag": 3.073364496231079e-08, "ewc_loss_parallel": 1.126900315284729e-07, "grad_norm": 5.264333724975586, "learning_rate": 3.094531581178465e-08, "loss": 0.8888, "mean_token_accuracy": 0.7551218271255493, "num_tokens": 2828003.0, "step": 74 }, { "epoch": 0.009540770894288259, "ewc_loss": 0.00014591217041015625, "ewc_loss_diag": 3.213062882423401e-08, "ewc_loss_parallel": 1.1362135410308838e-07, "grad_norm": 4.4032392501831055, "learning_rate": 3.1369224247562524e-08, "loss": 0.764, "mean_token_accuracy": 0.7836307287216187, "num_tokens": 2874755.0, "step": 75 }, { "epoch": 0.00966798117287877, "ewc_loss": 0.000148773193359375, "ewc_loss_diag": 3.3760443329811096e-08, "ewc_loss_parallel": 1.1548399925231934e-07, "grad_norm": 4.730258941650391, "learning_rate": 3.17931326833404e-08, "loss": 0.786, "mean_token_accuracy": 0.7771087884902954, "num_tokens": 2909703.0, "step": 76 }, { "epoch": 0.009795191451469279, "ewc_loss": 0.000152587890625, "ewc_loss_diag": 3.4226104617118835e-08, "ewc_loss_parallel": 1.1827796697616577e-07, "grad_norm": 4.2982378005981445, "learning_rate": 3.221704111911827e-08, "loss": 0.7164, "mean_token_accuracy": 0.7988524436950684, "num_tokens": 2954020.0, "step": 77 }, { "epoch": 0.00992240173005979, "ewc_loss": 0.000152587890625, "ewc_loss_diag": 3.4924596548080444e-08, "ewc_loss_parallel": 1.1734664440155029e-07, "grad_norm": 5.289749622344971, "learning_rate": 3.264094955489614e-08, "loss": 0.8115, "mean_token_accuracy": 0.7749365568161011, "num_tokens": 2984894.0, "step": 78 }, { "epoch": 0.010049612008650299, "ewc_loss": 0.0001544952392578125, "ewc_loss_diag": 3.4924596548080444e-08, "ewc_loss_parallel": 1.1920928955078125e-07, "grad_norm": 5.046064853668213, "learning_rate": 3.306485799067401e-08, "loss": 0.7189, "mean_token_accuracy": 0.7949134707450867, "num_tokens": 3017773.0, "step": 79 }, { "epoch": 0.01017682228724081, "ewc_loss": 0.00015544891357421875, "ewc_loss_diag": 3.5390257835388184e-08, "ewc_loss_parallel": 1.2014061212539673e-07, "grad_norm": 4.420782089233398, "learning_rate": 3.348876642645188e-08, "loss": 0.7179, "mean_token_accuracy": 0.7980462908744812, "num_tokens": 3059739.0, "step": 80 }, { "epoch": 0.010304032565831319, "ewc_loss": 0.00015544891357421875, "ewc_loss_diag": 3.632158041000366e-08, "ewc_loss_parallel": 1.1920928955078125e-07, "grad_norm": 4.948384761810303, "learning_rate": 3.391267486222975e-08, "loss": 0.8098, "mean_token_accuracy": 0.7714114189147949, "num_tokens": 3094910.0, "step": 81 }, { "epoch": 0.01043124284442183, "ewc_loss": 0.00015354156494140625, "ewc_loss_diag": 3.67872416973114e-08, "ewc_loss_parallel": 1.1688098311424255e-07, "grad_norm": 4.684697151184082, "learning_rate": 3.4336583298007626e-08, "loss": 0.8139, "mean_token_accuracy": 0.7736119627952576, "num_tokens": 3133884.0, "step": 82 }, { "epoch": 0.010558453123012339, "ewc_loss": 0.000152587890625, "ewc_loss_diag": 3.748573362827301e-08, "ewc_loss_parallel": 1.150183379650116e-07, "grad_norm": 5.208122730255127, "learning_rate": 3.47604917337855e-08, "loss": 0.8464, "mean_token_accuracy": 0.7605479955673218, "num_tokens": 3166755.0, "step": 83 }, { "epoch": 0.01068566340160285, "ewc_loss": 0.0001506805419921875, "ewc_loss_diag": 3.771856427192688e-08, "ewc_loss_parallel": 1.1362135410308838e-07, "grad_norm": 4.823515892028809, "learning_rate": 3.518440016956337e-08, "loss": 0.8152, "mean_token_accuracy": 0.77116858959198, "num_tokens": 3204825.0, "step": 84 }, { "epoch": 0.010812873680193359, "ewc_loss": 0.0001506805419921875, "ewc_loss_diag": 3.795139491558075e-08, "ewc_loss_parallel": 1.126900315284729e-07, "grad_norm": 4.802169322967529, "learning_rate": 3.5608308605341244e-08, "loss": 0.8504, "mean_token_accuracy": 0.7629399299621582, "num_tokens": 3244036.0, "step": 85 }, { "epoch": 0.01094008395878387, "ewc_loss": 0.00015163421630859375, "ewc_loss_diag": 3.888271749019623e-08, "ewc_loss_parallel": 1.126900315284729e-07, "grad_norm": 5.026459693908691, "learning_rate": 3.6032217041119116e-08, "loss": 0.838, "mean_token_accuracy": 0.7667479515075684, "num_tokens": 3285969.0, "step": 86 }, { "epoch": 0.01106729423737438, "ewc_loss": 0.000152587890625, "ewc_loss_diag": 4.0046870708465576e-08, "ewc_loss_parallel": 1.1315569281578064e-07, "grad_norm": 4.581179141998291, "learning_rate": 3.645612547689699e-08, "loss": 0.8093, "mean_token_accuracy": 0.7747766375541687, "num_tokens": 3327648.0, "step": 87 }, { "epoch": 0.01119450451596489, "ewc_loss": 0.00015544891357421875, "ewc_loss_diag": 4.1211023926734924e-08, "ewc_loss_parallel": 1.1408701539039612e-07, "grad_norm": 4.706473350524902, "learning_rate": 3.6880033912674855e-08, "loss": 0.8123, "mean_token_accuracy": 0.7727906703948975, "num_tokens": 3367399.0, "step": 88 }, { "epoch": 0.0113217147945554, "ewc_loss": 0.00015735626220703125, "ewc_loss_diag": 4.21423465013504e-08, "ewc_loss_parallel": 1.1548399925231934e-07, "grad_norm": 4.889590263366699, "learning_rate": 3.730394234845273e-08, "loss": 0.7955, "mean_token_accuracy": 0.7797234654426575, "num_tokens": 3405402.0, "step": 89 }, { "epoch": 0.01144892507314591, "ewc_loss": 0.00016117095947265625, "ewc_loss_diag": 4.353933036327362e-08, "ewc_loss_parallel": 1.1781230568885803e-07, "grad_norm": 4.991866588592529, "learning_rate": 3.77278507842306e-08, "loss": 0.8789, "mean_token_accuracy": 0.7541574835777283, "num_tokens": 3441791.0, "step": 90 }, { "epoch": 0.01157613535173642, "ewc_loss": 0.0001659393310546875, "ewc_loss_diag": 4.423782229423523e-08, "ewc_loss_parallel": 1.210719347000122e-07, "grad_norm": 4.957711696624756, "learning_rate": 3.815175922000847e-08, "loss": 0.8386, "mean_token_accuracy": 0.7687037587165833, "num_tokens": 3480151.0, "step": 91 }, { "epoch": 0.01170334563032693, "ewc_loss": 0.0001697540283203125, "ewc_loss_diag": 4.516914486885071e-08, "ewc_loss_parallel": 1.2479722499847412e-07, "grad_norm": 4.856903076171875, "learning_rate": 3.8575667655786345e-08, "loss": 0.8019, "mean_token_accuracy": 0.7757307887077332, "num_tokens": 3516867.0, "step": 92 }, { "epoch": 0.01183055590891744, "ewc_loss": 0.00018024444580078125, "ewc_loss_diag": 4.6798959374427795e-08, "ewc_loss_parallel": 1.3317912817001343e-07, "grad_norm": 4.485822677612305, "learning_rate": 3.899957609156422e-08, "loss": 0.7781, "mean_token_accuracy": 0.7825303673744202, "num_tokens": 3557466.0, "step": 93 }, { "epoch": 0.01195776618750795, "ewc_loss": 0.0001926422119140625, "ewc_loss_diag": 4.7963112592697144e-08, "ewc_loss_parallel": 1.4528632164001465e-07, "grad_norm": 4.695289134979248, "learning_rate": 3.94234845273421e-08, "loss": 0.8074, "mean_token_accuracy": 0.7728368043899536, "num_tokens": 3596009.0, "step": 94 }, { "epoch": 0.012084976466098461, "ewc_loss": 0.000217437744140625, "ewc_loss_diag": 5.005858838558197e-08, "ewc_loss_parallel": 1.6763806343078613e-07, "grad_norm": 4.934337615966797, "learning_rate": 3.984739296311997e-08, "loss": 0.7995, "mean_token_accuracy": 0.7745263576507568, "num_tokens": 3631582.0, "step": 95 }, { "epoch": 0.01221218674468897, "ewc_loss": 0.0002727508544921875, "ewc_loss_diag": 5.3551048040390015e-08, "ewc_loss_parallel": 2.1886080503463745e-07, "grad_norm": 4.973684310913086, "learning_rate": 4.027130139889784e-08, "loss": 0.8265, "mean_token_accuracy": 0.7677362561225891, "num_tokens": 3672855.0, "step": 96 }, { "epoch": 0.012339397023279481, "ewc_loss": 0.000354766845703125, "ewc_loss_diag": 6.007030606269836e-08, "ewc_loss_parallel": 2.9616057872772217e-07, "grad_norm": 4.925044536590576, "learning_rate": 4.0695209834675715e-08, "loss": 0.8183, "mean_token_accuracy": 0.772737979888916, "num_tokens": 3714100.0, "step": 97 }, { "epoch": 0.012466607301869992, "ewc_loss": 0.000469207763671875, "ewc_loss_diag": 7.310882210731506e-08, "ewc_loss_parallel": 3.9674341678619385e-07, "grad_norm": 5.161657810211182, "learning_rate": 4.111911827045358e-08, "loss": 0.7696, "mean_token_accuracy": 0.7853796482086182, "num_tokens": 3752834.0, "step": 98 }, { "epoch": 0.012593817580460501, "ewc_loss": 0.0005645751953125, "ewc_loss_diag": 8.568167686462402e-08, "ewc_loss_parallel": 4.805624485015869e-07, "grad_norm": 5.188429832458496, "learning_rate": 4.154302670623145e-08, "loss": 0.7799, "mean_token_accuracy": 0.7811051607131958, "num_tokens": 3791513.0, "step": 99 }, { "epoch": 0.012721027859051012, "ewc_loss": 0.00058746337890625, "ewc_loss_diag": 9.499490261077881e-08, "ewc_loss_parallel": 4.917383193969727e-07, "grad_norm": 5.582276344299316, "learning_rate": 4.1966935142009326e-08, "loss": 0.8033, "mean_token_accuracy": 0.7739445567131042, "num_tokens": 3825085.0, "step": 100 }, { "epoch": 0.012848238137641521, "ewc_loss": 0.0005950927734375, "ewc_loss_diag": 1.019798219203949e-07, "ewc_loss_parallel": 4.917383193969727e-07, "grad_norm": 5.107059955596924, "learning_rate": 4.23908435777872e-08, "loss": 0.8028, "mean_token_accuracy": 0.7766926884651184, "num_tokens": 3866492.0, "step": 101 }, { "epoch": 0.012975448416232032, "ewc_loss": 0.000591278076171875, "ewc_loss_diag": 1.0617077350616455e-07, "ewc_loss_parallel": 4.842877388000488e-07, "grad_norm": 5.360418319702148, "learning_rate": 4.281475201356507e-08, "loss": 0.8301, "mean_token_accuracy": 0.7633270025253296, "num_tokens": 3904053.0, "step": 102 }, { "epoch": 0.013102658694822541, "ewc_loss": 0.000591278076171875, "ewc_loss_diag": 1.1175870895385742e-07, "ewc_loss_parallel": 4.805624485015869e-07, "grad_norm": 5.399483680725098, "learning_rate": 4.3238660449342943e-08, "loss": 0.7385, "mean_token_accuracy": 0.7948514223098755, "num_tokens": 3935118.0, "step": 103 }, { "epoch": 0.013229868973413052, "ewc_loss": 0.0005950927734375, "ewc_loss_diag": 1.1455267667770386e-07, "ewc_loss_parallel": 4.805624485015869e-07, "grad_norm": 5.034303188323975, "learning_rate": 4.3662568885120816e-08, "loss": 0.7887, "mean_token_accuracy": 0.7762508392333984, "num_tokens": 3977899.0, "step": 104 }, { "epoch": 0.013357079252003561, "ewc_loss": 0.00060272216796875, "ewc_loss_diag": 1.1688098311424255e-07, "ewc_loss_parallel": 4.842877388000488e-07, "grad_norm": 5.052534103393555, "learning_rate": 4.408647732089869e-08, "loss": 0.8092, "mean_token_accuracy": 0.773781955242157, "num_tokens": 4019416.0, "step": 105 }, { "epoch": 0.013484289530594072, "ewc_loss": 0.000606536865234375, "ewc_loss_diag": 1.1734664440155029e-07, "ewc_loss_parallel": 4.880130290985107e-07, "grad_norm": 5.046960830688477, "learning_rate": 4.451038575667656e-08, "loss": 0.7978, "mean_token_accuracy": 0.7726414799690247, "num_tokens": 4063887.0, "step": 106 }, { "epoch": 0.013611499809184581, "ewc_loss": 0.00060272216796875, "ewc_loss_diag": 1.1920928955078125e-07, "ewc_loss_parallel": 4.842877388000488e-07, "grad_norm": 5.133779048919678, "learning_rate": 4.493429419245443e-08, "loss": 0.829, "mean_token_accuracy": 0.7670596241950989, "num_tokens": 4101309.0, "step": 107 }, { "epoch": 0.013738710087775092, "ewc_loss": 0.00060272216796875, "ewc_loss_diag": 1.2014061212539673e-07, "ewc_loss_parallel": 4.842877388000488e-07, "grad_norm": 5.254006385803223, "learning_rate": 4.53582026282323e-08, "loss": 0.8133, "mean_token_accuracy": 0.770664393901825, "num_tokens": 4136669.0, "step": 108 }, { "epoch": 0.013865920366365603, "ewc_loss": 0.0005950927734375, "ewc_loss_diag": 1.1920928955078125e-07, "ewc_loss_parallel": 4.76837158203125e-07, "grad_norm": 4.90879487991333, "learning_rate": 4.578211106401017e-08, "loss": 0.7441, "mean_token_accuracy": 0.7899408340454102, "num_tokens": 4178583.0, "step": 109 }, { "epoch": 0.013993130644956112, "ewc_loss": 0.00058746337890625, "ewc_loss_diag": 1.210719347000122e-07, "ewc_loss_parallel": 4.675239324569702e-07, "grad_norm": 4.907901763916016, "learning_rate": 4.6206019499788045e-08, "loss": 0.7747, "mean_token_accuracy": 0.7772694826126099, "num_tokens": 4219552.0, "step": 110 }, { "epoch": 0.014120340923546623, "ewc_loss": 0.000579833984375, "ewc_loss_diag": 1.2293457984924316e-07, "ewc_loss_parallel": 4.5634806156158447e-07, "grad_norm": 4.918818950653076, "learning_rate": 4.662992793556592e-08, "loss": 0.7673, "mean_token_accuracy": 0.7852778434753418, "num_tokens": 4264487.0, "step": 111 }, { "epoch": 0.014247551202137132, "ewc_loss": 0.0005645751953125, "ewc_loss_diag": 1.2386590242385864e-07, "ewc_loss_parallel": 4.4330954551696777e-07, "grad_norm": 4.979314804077148, "learning_rate": 4.705383637134379e-08, "loss": 0.7336, "mean_token_accuracy": 0.7897274494171143, "num_tokens": 4303015.0, "step": 112 }, { "epoch": 0.014374761480727643, "ewc_loss": 0.00055694580078125, "ewc_loss_diag": 1.2479722499847412e-07, "ewc_loss_parallel": 4.3213367462158203e-07, "grad_norm": 4.918485164642334, "learning_rate": 4.747774480712166e-08, "loss": 0.7885, "mean_token_accuracy": 0.7765676379203796, "num_tokens": 4345446.0, "step": 113 }, { "epoch": 0.014501971759318152, "ewc_loss": 0.000545501708984375, "ewc_loss_diag": 1.2479722499847412e-07, "ewc_loss_parallel": 4.209578037261963e-07, "grad_norm": 4.9859418869018555, "learning_rate": 4.7901653242899535e-08, "loss": 0.7408, "mean_token_accuracy": 0.7911738753318787, "num_tokens": 4383854.0, "step": 114 }, { "epoch": 0.014629182037908663, "ewc_loss": 0.000537872314453125, "ewc_loss_diag": 1.2665987014770508e-07, "ewc_loss_parallel": 4.0978193283081055e-07, "grad_norm": 5.013106346130371, "learning_rate": 4.832556167867741e-08, "loss": 0.8099, "mean_token_accuracy": 0.7727905511856079, "num_tokens": 4420750.0, "step": 115 }, { "epoch": 0.014756392316499172, "ewc_loss": 0.0005340576171875, "ewc_loss_diag": 1.2759119272232056e-07, "ewc_loss_parallel": 4.0605664253234863e-07, "grad_norm": 5.132528781890869, "learning_rate": 4.8749470114455274e-08, "loss": 0.8067, "mean_token_accuracy": 0.7730497121810913, "num_tokens": 4458761.0, "step": 116 }, { "epoch": 0.014883602595089683, "ewc_loss": 0.00052642822265625, "ewc_loss_diag": 1.2759119272232056e-07, "ewc_loss_parallel": 3.986060619354248e-07, "grad_norm": 4.982077121734619, "learning_rate": 4.9173378550233146e-08, "loss": 0.767, "mean_token_accuracy": 0.7807592153549194, "num_tokens": 4496547.0, "step": 117 }, { "epoch": 0.015010812873680193, "ewc_loss": 0.000522613525390625, "ewc_loss_diag": 1.2852251529693604e-07, "ewc_loss_parallel": 3.948807716369629e-07, "grad_norm": 5.008535861968994, "learning_rate": 4.959728698601102e-08, "loss": 0.7653, "mean_token_accuracy": 0.7891691327095032, "num_tokens": 4533357.0, "step": 118 }, { "epoch": 0.015138023152270703, "ewc_loss": 0.000522613525390625, "ewc_loss_diag": 1.30385160446167e-07, "ewc_loss_parallel": 3.9301812648773193e-07, "grad_norm": 4.881608009338379, "learning_rate": 5.002119542178889e-08, "loss": 0.7399, "mean_token_accuracy": 0.7905857563018799, "num_tokens": 4573570.0, "step": 119 }, { "epoch": 0.015265233430861214, "ewc_loss": 0.00051116943359375, "ewc_loss_diag": 1.3131648302078247e-07, "ewc_loss_parallel": 3.818422555923462e-07, "grad_norm": 5.024876594543457, "learning_rate": 5.0445103857566764e-08, "loss": 0.795, "mean_token_accuracy": 0.7748896479606628, "num_tokens": 4613195.0, "step": 120 }, { "epoch": 0.015392443709451724, "ewc_loss": 0.00051116943359375, "ewc_loss_diag": 1.3504177331924438e-07, "ewc_loss_parallel": 3.762543201446533e-07, "grad_norm": 4.784104347229004, "learning_rate": 5.0869012293344637e-08, "loss": 0.7152, "mean_token_accuracy": 0.7978031039237976, "num_tokens": 4657230.0, "step": 121 }, { "epoch": 0.015519653988042234, "ewc_loss": 0.00051116943359375, "ewc_loss_diag": 1.3597309589385986e-07, "ewc_loss_parallel": 3.7439167499542236e-07, "grad_norm": 4.890507221221924, "learning_rate": 5.129292072912251e-08, "loss": 0.7891, "mean_token_accuracy": 0.7802785634994507, "num_tokens": 4701436.0, "step": 122 }, { "epoch": 0.015646864266632744, "ewc_loss": 0.000507354736328125, "ewc_loss_diag": 1.3783574104309082e-07, "ewc_loss_parallel": 3.688037395477295e-07, "grad_norm": 4.9851226806640625, "learning_rate": 5.171682916490038e-08, "loss": 0.7767, "mean_token_accuracy": 0.7796653509140015, "num_tokens": 4741806.0, "step": 123 }, { "epoch": 0.015774074545223254, "ewc_loss": 0.0005035400390625, "ewc_loss_diag": 1.3969838619232178e-07, "ewc_loss_parallel": 3.6694109439849854e-07, "grad_norm": 5.118183135986328, "learning_rate": 5.2140737600678254e-08, "loss": 0.8139, "mean_token_accuracy": 0.7651013731956482, "num_tokens": 4780671.0, "step": 124 }, { "epoch": 0.015901284823813765, "ewc_loss": 0.00051116943359375, "ewc_loss_diag": 1.4156103134155273e-07, "ewc_loss_parallel": 3.7066638469696045e-07, "grad_norm": 5.123435020446777, "learning_rate": 5.256464603645612e-08, "loss": 0.7625, "mean_token_accuracy": 0.7864018678665161, "num_tokens": 4816879.0, "step": 125 }, { "epoch": 0.016028495102404273, "ewc_loss": 0.00051116943359375, "ewc_loss_diag": 1.434236764907837e-07, "ewc_loss_parallel": 3.688037395477295e-07, "grad_norm": 5.0159783363342285, "learning_rate": 5.298855447223399e-08, "loss": 0.8151, "mean_token_accuracy": 0.7674744129180908, "num_tokens": 4858704.0, "step": 126 }, { "epoch": 0.016155705380994784, "ewc_loss": 0.00051116943359375, "ewc_loss_diag": 1.4435499906539917e-07, "ewc_loss_parallel": 3.688037395477295e-07, "grad_norm": 5.034819602966309, "learning_rate": 5.3412462908011865e-08, "loss": 0.7723, "mean_token_accuracy": 0.7818529605865479, "num_tokens": 4896780.0, "step": 127 }, { "epoch": 0.016282915659585295, "ewc_loss": 0.000522613525390625, "ewc_loss_diag": 1.4994293451309204e-07, "ewc_loss_parallel": 3.725290298461914e-07, "grad_norm": 5.21068000793457, "learning_rate": 5.383637134378974e-08, "loss": 0.8021, "mean_token_accuracy": 0.7720232009887695, "num_tokens": 4932108.0, "step": 128 }, { "epoch": 0.016410125938175806, "ewc_loss": 0.000530242919921875, "ewc_loss_diag": 1.5273690223693848e-07, "ewc_loss_parallel": 3.781169652938843e-07, "grad_norm": 4.977545738220215, "learning_rate": 5.426027977956761e-08, "loss": 0.7785, "mean_token_accuracy": 0.7815446853637695, "num_tokens": 4970384.0, "step": 129 }, { "epoch": 0.016537336216766316, "ewc_loss": 0.000545501708984375, "ewc_loss_diag": 1.5832483768463135e-07, "ewc_loss_parallel": 3.8743019104003906e-07, "grad_norm": 5.251943588256836, "learning_rate": 5.468418821534548e-08, "loss": 0.7873, "mean_token_accuracy": 0.7765308618545532, "num_tokens": 5005996.0, "step": 130 }, { "epoch": 0.016664546495356824, "ewc_loss": 0.000568389892578125, "ewc_loss_diag": 1.6298145055770874e-07, "ewc_loss_parallel": 4.041939973831177e-07, "grad_norm": 5.085931777954102, "learning_rate": 5.5108096651123356e-08, "loss": 0.7673, "mean_token_accuracy": 0.7810689210891724, "num_tokens": 5044208.0, "step": 131 }, { "epoch": 0.016791756773947335, "ewc_loss": 0.0005950927734375, "ewc_loss_diag": 1.6763806343078613e-07, "ewc_loss_parallel": 4.284083843231201e-07, "grad_norm": 5.116197109222412, "learning_rate": 5.553200508690123e-08, "loss": 0.7821, "mean_token_accuracy": 0.7757670879364014, "num_tokens": 5085173.0, "step": 132 }, { "epoch": 0.016918967052537846, "ewc_loss": 0.000637054443359375, "ewc_loss_diag": 1.7415732145309448e-07, "ewc_loss_parallel": 4.6193599700927734e-07, "grad_norm": 5.376977443695068, "learning_rate": 5.59559135226791e-08, "loss": 0.7993, "mean_token_accuracy": 0.773490309715271, "num_tokens": 5118440.0, "step": 133 }, { "epoch": 0.017046177331128357, "ewc_loss": 0.000675201416015625, "ewc_loss_diag": 1.7881393432617188e-07, "ewc_loss_parallel": 4.954636096954346e-07, "grad_norm": 5.464428901672363, "learning_rate": 5.637982195845697e-08, "loss": 0.8038, "mean_token_accuracy": 0.7740107774734497, "num_tokens": 5155202.0, "step": 134 }, { "epoch": 0.017173387609718864, "ewc_loss": 0.000728607177734375, "ewc_loss_diag": 1.8440186977386475e-07, "ewc_loss_parallel": 5.438923835754395e-07, "grad_norm": 5.3543853759765625, "learning_rate": 5.680373039423484e-08, "loss": 0.7901, "mean_token_accuracy": 0.7788395285606384, "num_tokens": 5193240.0, "step": 135 }, { "epoch": 0.017300597888309375, "ewc_loss": 0.00081634521484375, "ewc_loss_diag": 1.955777406692505e-07, "ewc_loss_parallel": 6.221234798431396e-07, "grad_norm": 5.5439348220825195, "learning_rate": 5.722763883001271e-08, "loss": 0.7787, "mean_token_accuracy": 0.7801821231842041, "num_tokens": 5230975.0, "step": 136 }, { "epoch": 0.017427808166899886, "ewc_loss": 0.00090789794921875, "ewc_loss_diag": 2.0489096641540527e-07, "ewc_loss_parallel": 7.003545761108398e-07, "grad_norm": 5.901668071746826, "learning_rate": 5.7651547265790585e-08, "loss": 0.8419, "mean_token_accuracy": 0.7636724710464478, "num_tokens": 5264786.0, "step": 137 }, { "epoch": 0.017555018445490397, "ewc_loss": 0.00102996826171875, "ewc_loss_diag": 2.1979212760925293e-07, "ewc_loss_parallel": 8.083879947662354e-07, "grad_norm": 5.707575798034668, "learning_rate": 5.807545570156846e-08, "loss": 0.8043, "mean_token_accuracy": 0.7728018164634705, "num_tokens": 5301554.0, "step": 138 }, { "epoch": 0.017682228724080904, "ewc_loss": 0.0011749267578125, "ewc_loss_diag": 2.3562461137771606e-07, "ewc_loss_parallel": 9.387731552124023e-07, "grad_norm": 5.70512580871582, "learning_rate": 5.849936413734633e-08, "loss": 0.7437, "mean_token_accuracy": 0.789516806602478, "num_tokens": 5337356.0, "step": 139 }, { "epoch": 0.017809439002671415, "ewc_loss": 0.0012664794921875, "ewc_loss_diag": 2.4028122425079346e-07, "ewc_loss_parallel": 1.0207295417785645e-06, "grad_norm": 5.794168472290039, "learning_rate": 5.89232725731242e-08, "loss": 0.8143, "mean_token_accuracy": 0.7731024622917175, "num_tokens": 5375767.0, "step": 140 }, { "epoch": 0.017936649281261926, "ewc_loss": 0.00136566162109375, "ewc_loss_diag": 2.4586915969848633e-07, "ewc_loss_parallel": 1.125037670135498e-06, "grad_norm": 5.7679219245910645, "learning_rate": 5.9347181008902075e-08, "loss": 0.7651, "mean_token_accuracy": 0.781966507434845, "num_tokens": 5413683.0, "step": 141 }, { "epoch": 0.018063859559852437, "ewc_loss": 0.00146484375, "ewc_loss_diag": 2.551823854446411e-07, "ewc_loss_parallel": 1.2069940567016602e-06, "grad_norm": 5.579967021942139, "learning_rate": 5.977108944467995e-08, "loss": 0.7457, "mean_token_accuracy": 0.7875468730926514, "num_tokens": 5457750.0, "step": 142 }, { "epoch": 0.018191069838442948, "ewc_loss": 0.00151824951171875, "ewc_loss_diag": 2.60770320892334e-07, "ewc_loss_parallel": 1.259148120880127e-06, "grad_norm": 5.793055534362793, "learning_rate": 6.019499788045781e-08, "loss": 0.7409, "mean_token_accuracy": 0.7885934114456177, "num_tokens": 5495743.0, "step": 143 }, { "epoch": 0.018318280117033455, "ewc_loss": 0.001556396484375, "ewc_loss_diag": 2.6263296604156494e-07, "ewc_loss_parallel": 1.2889504432678223e-06, "grad_norm": 5.877959728240967, "learning_rate": 6.061890631623569e-08, "loss": 0.7539, "mean_token_accuracy": 0.7810723781585693, "num_tokens": 5532198.0, "step": 144 }, { "epoch": 0.018445490395623966, "ewc_loss": 0.0015869140625, "ewc_loss_diag": 2.738088369369507e-07, "ewc_loss_parallel": 1.3113021850585938e-06, "grad_norm": 5.976709842681885, "learning_rate": 6.104281475201356e-08, "loss": 0.7667, "mean_token_accuracy": 0.7817153334617615, "num_tokens": 5568977.0, "step": 145 }, { "epoch": 0.018572700674214477, "ewc_loss": 0.0016632080078125, "ewc_loss_diag": 2.868473529815674e-07, "ewc_loss_parallel": 1.3709068298339844e-06, "grad_norm": 5.948916912078857, "learning_rate": 6.146672318779143e-08, "loss": 0.7787, "mean_token_accuracy": 0.7727857232093811, "num_tokens": 5606229.0, "step": 146 }, { "epoch": 0.018699910952804988, "ewc_loss": 0.001708984375, "ewc_loss_diag": 2.998858690261841e-07, "ewc_loss_parallel": 1.4081597328186035e-06, "grad_norm": 5.867446422576904, "learning_rate": 6.18906316235693e-08, "loss": 0.7793, "mean_token_accuracy": 0.7753987312316895, "num_tokens": 5649828.0, "step": 147 }, { "epoch": 0.018827121231395495, "ewc_loss": 0.001739501953125, "ewc_loss_diag": 3.0919909477233887e-07, "ewc_loss_parallel": 1.4230608940124512e-06, "grad_norm": 6.0127668380737305, "learning_rate": 6.231454005934718e-08, "loss": 0.7261, "mean_token_accuracy": 0.7950402498245239, "num_tokens": 5687433.0, "step": 148 }, { "epoch": 0.018954331509986006, "ewc_loss": 0.00173187255859375, "ewc_loss_diag": 3.110617399215698e-07, "ewc_loss_parallel": 1.4156103134155273e-06, "grad_norm": 6.109597206115723, "learning_rate": 6.273844849512505e-08, "loss": 0.7567, "mean_token_accuracy": 0.7823126316070557, "num_tokens": 5723247.0, "step": 149 }, { "epoch": 0.019081541788576517, "ewc_loss": 0.00174713134765625, "ewc_loss_diag": 3.203749656677246e-07, "ewc_loss_parallel": 1.4230608940124512e-06, "grad_norm": 6.107264995574951, "learning_rate": 6.316235693090292e-08, "loss": 0.771, "mean_token_accuracy": 0.7780706882476807, "num_tokens": 5760305.0, "step": 150 }, { "epoch": 0.019208752067167028, "ewc_loss": 0.00174713134765625, "ewc_loss_diag": 3.2782554626464844e-07, "ewc_loss_parallel": 1.4156103134155273e-06, "grad_norm": 5.999269962310791, "learning_rate": 6.35862653666808e-08, "loss": 0.771, "mean_token_accuracy": 0.7762397527694702, "num_tokens": 5800586.0, "step": 151 }, { "epoch": 0.01933596234575754, "ewc_loss": 0.001739501953125, "ewc_loss_diag": 3.3155083656311035e-07, "ewc_loss_parallel": 1.4081597328186035e-06, "grad_norm": 5.991140842437744, "learning_rate": 6.401017380245867e-08, "loss": 0.7624, "mean_token_accuracy": 0.7783851027488708, "num_tokens": 5840351.0, "step": 152 }, { "epoch": 0.019463172624348046, "ewc_loss": 0.001739501953125, "ewc_loss_diag": 3.334134817123413e-07, "ewc_loss_parallel": 1.4007091522216797e-06, "grad_norm": 6.012501239776611, "learning_rate": 6.443408223823654e-08, "loss": 0.8031, "mean_token_accuracy": 0.7698224782943726, "num_tokens": 5880162.0, "step": 153 }, { "epoch": 0.019590382902938557, "ewc_loss": 0.0017242431640625, "ewc_loss_diag": 3.390014171600342e-07, "ewc_loss_parallel": 1.385807991027832e-06, "grad_norm": 5.97807502746582, "learning_rate": 6.485799067401441e-08, "loss": 0.8585, "mean_token_accuracy": 0.7542816996574402, "num_tokens": 5922213.0, "step": 154 }, { "epoch": 0.019717593181529068, "ewc_loss": 0.00171661376953125, "ewc_loss_diag": 3.427267074584961e-07, "ewc_loss_parallel": 1.3709068298339844e-06, "grad_norm": 5.9360270500183105, "learning_rate": 6.528189910979228e-08, "loss": 0.7404, "mean_token_accuracy": 0.7848209738731384, "num_tokens": 5957461.0, "step": 155 }, { "epoch": 0.01984480346011958, "ewc_loss": 0.001708984375, "ewc_loss_diag": 3.520399332046509e-07, "ewc_loss_parallel": 1.3560056686401367e-06, "grad_norm": 5.873482704162598, "learning_rate": 6.570580754557016e-08, "loss": 0.725, "mean_token_accuracy": 0.7850289940834045, "num_tokens": 5994675.0, "step": 156 }, { "epoch": 0.019972013738710086, "ewc_loss": 0.0016937255859375, "ewc_loss_diag": 3.5762786865234375e-07, "ewc_loss_parallel": 1.3336539268493652e-06, "grad_norm": 5.783242225646973, "learning_rate": 6.612971598134802e-08, "loss": 0.688, "mean_token_accuracy": 0.800718367099762, "num_tokens": 6032514.0, "step": 157 }, { "epoch": 0.020099224017300597, "ewc_loss": 0.0016632080078125, "ewc_loss_diag": 3.5762786865234375e-07, "ewc_loss_parallel": 1.30385160446167e-06, "grad_norm": 5.8661065101623535, "learning_rate": 6.655362441712589e-08, "loss": 0.7603, "mean_token_accuracy": 0.7791225910186768, "num_tokens": 6069160.0, "step": 158 }, { "epoch": 0.020226434295891108, "ewc_loss": 0.0016632080078125, "ewc_loss_diag": 3.632158041000366e-07, "ewc_loss_parallel": 1.296401023864746e-06, "grad_norm": 5.928823947906494, "learning_rate": 6.697753285290376e-08, "loss": 0.8144, "mean_token_accuracy": 0.7634865045547485, "num_tokens": 6106652.0, "step": 159 }, { "epoch": 0.02035364457448162, "ewc_loss": 0.00164031982421875, "ewc_loss_diag": 3.650784492492676e-07, "ewc_loss_parallel": 1.2740492820739746e-06, "grad_norm": 5.838155746459961, "learning_rate": 6.740144128868163e-08, "loss": 0.749, "mean_token_accuracy": 0.7844212055206299, "num_tokens": 6143070.0, "step": 160 }, { "epoch": 0.020480854853072127, "ewc_loss": 0.0016021728515625, "ewc_loss_diag": 3.688037395477295e-07, "ewc_loss_parallel": 1.2367963790893555e-06, "grad_norm": 5.653575420379639, "learning_rate": 6.78253497244595e-08, "loss": 0.7409, "mean_token_accuracy": 0.7859700918197632, "num_tokens": 6186495.0, "step": 161 }, { "epoch": 0.020608065131662637, "ewc_loss": 0.00157928466796875, "ewc_loss_diag": 3.7439167499542236e-07, "ewc_loss_parallel": 1.2069940567016602e-06, "grad_norm": 5.855959892272949, "learning_rate": 6.824925816023738e-08, "loss": 0.7418, "mean_token_accuracy": 0.785132646560669, "num_tokens": 6222454.0, "step": 162 }, { "epoch": 0.02073527541025315, "ewc_loss": 0.0015716552734375, "ewc_loss_diag": 3.7439167499542236e-07, "ewc_loss_parallel": 1.1995434761047363e-06, "grad_norm": 6.092164039611816, "learning_rate": 6.867316659601525e-08, "loss": 0.7553, "mean_token_accuracy": 0.7841020822525024, "num_tokens": 6253260.0, "step": 163 }, { "epoch": 0.02086248568884366, "ewc_loss": 0.0015716552734375, "ewc_loss_diag": 3.7439167499542236e-07, "ewc_loss_parallel": 1.1995434761047363e-06, "grad_norm": 5.721667766571045, "learning_rate": 6.909707503179312e-08, "loss": 0.7347, "mean_token_accuracy": 0.7888060808181763, "num_tokens": 6294773.0, "step": 164 }, { "epoch": 0.02098969596743417, "ewc_loss": 0.00156402587890625, "ewc_loss_diag": 3.818422555923462e-07, "ewc_loss_parallel": 1.1846423149108887e-06, "grad_norm": 5.773577690124512, "learning_rate": 6.9520983467571e-08, "loss": 0.7787, "mean_token_accuracy": 0.7749453186988831, "num_tokens": 6335334.0, "step": 165 }, { "epoch": 0.021116906246024678, "ewc_loss": 0.0015411376953125, "ewc_loss_diag": 3.855675458908081e-07, "ewc_loss_parallel": 1.1548399925231934e-06, "grad_norm": 5.755454063415527, "learning_rate": 6.994489190334887e-08, "loss": 0.7144, "mean_token_accuracy": 0.790730357170105, "num_tokens": 6372255.0, "step": 166 }, { "epoch": 0.02124411652461519, "ewc_loss": 0.00151824951171875, "ewc_loss_diag": 3.91155481338501e-07, "ewc_loss_parallel": 1.1324882507324219e-06, "grad_norm": 5.831704616546631, "learning_rate": 7.036880033912674e-08, "loss": 0.6903, "mean_token_accuracy": 0.7992942929267883, "num_tokens": 6411971.0, "step": 167 }, { "epoch": 0.0213713268032057, "ewc_loss": 0.0015106201171875, "ewc_loss_diag": 3.9301812648773193e-07, "ewc_loss_parallel": 1.125037670135498e-06, "grad_norm": 5.928907871246338, "learning_rate": 7.079270877490461e-08, "loss": 0.7615, "mean_token_accuracy": 0.7788702249526978, "num_tokens": 6450366.0, "step": 168 }, { "epoch": 0.02149853708179621, "ewc_loss": 0.0015106201171875, "ewc_loss_diag": 3.948807716369629e-07, "ewc_loss_parallel": 1.1175870895385742e-06, "grad_norm": 5.982916831970215, "learning_rate": 7.121661721068249e-08, "loss": 0.7252, "mean_token_accuracy": 0.7885188460350037, "num_tokens": 6482199.0, "step": 169 }, { "epoch": 0.021625747360386718, "ewc_loss": 0.0015106201171875, "ewc_loss_diag": 3.9674341678619385e-07, "ewc_loss_parallel": 1.1101365089416504e-06, "grad_norm": 5.76764440536499, "learning_rate": 7.164052564646036e-08, "loss": 0.7054, "mean_token_accuracy": 0.796871542930603, "num_tokens": 6519697.0, "step": 170 }, { "epoch": 0.02175295763897723, "ewc_loss": 0.001495361328125, "ewc_loss_diag": 4.0605664253234863e-07, "ewc_loss_parallel": 1.087784767150879e-06, "grad_norm": 5.924586296081543, "learning_rate": 7.206443408223823e-08, "loss": 0.7665, "mean_token_accuracy": 0.7781064510345459, "num_tokens": 6554687.0, "step": 171 }, { "epoch": 0.02188016791756774, "ewc_loss": 0.001495361328125, "ewc_loss_diag": 4.079192876815796e-07, "ewc_loss_parallel": 1.087784767150879e-06, "grad_norm": 5.875988960266113, "learning_rate": 7.24883425180161e-08, "loss": 0.7294, "mean_token_accuracy": 0.7812451124191284, "num_tokens": 6593066.0, "step": 172 }, { "epoch": 0.02200737819615825, "ewc_loss": 0.00148773193359375, "ewc_loss_diag": 4.116445779800415e-07, "ewc_loss_parallel": 1.0728836059570312e-06, "grad_norm": 5.713994979858398, "learning_rate": 7.291225095379398e-08, "loss": 0.687, "mean_token_accuracy": 0.8010225296020508, "num_tokens": 6635484.0, "step": 173 }, { "epoch": 0.02213458847474876, "ewc_loss": 0.00147247314453125, "ewc_loss_diag": 4.1350722312927246e-07, "ewc_loss_parallel": 1.0579824447631836e-06, "grad_norm": 5.764242172241211, "learning_rate": 7.333615938957185e-08, "loss": 0.7796, "mean_token_accuracy": 0.774773120880127, "num_tokens": 6677090.0, "step": 174 }, { "epoch": 0.02226179875333927, "ewc_loss": 0.00146484375, "ewc_loss_diag": 4.153698682785034e-07, "ewc_loss_parallel": 1.0505318641662598e-06, "grad_norm": 5.871771812438965, "learning_rate": 7.376006782534971e-08, "loss": 0.6954, "mean_token_accuracy": 0.7957190275192261, "num_tokens": 6710250.0, "step": 175 }, { "epoch": 0.02238900903192978, "ewc_loss": 0.00146484375, "ewc_loss_diag": 4.172325134277344e-07, "ewc_loss_parallel": 1.043081283569336e-06, "grad_norm": 5.750709533691406, "learning_rate": 7.418397626112758e-08, "loss": 0.7211, "mean_token_accuracy": 0.7871600389480591, "num_tokens": 6752998.0, "step": 176 }, { "epoch": 0.02251621931052029, "ewc_loss": 0.00144195556640625, "ewc_loss_diag": 4.172325134277344e-07, "ewc_loss_parallel": 1.0207295417785645e-06, "grad_norm": 5.904139518737793, "learning_rate": 7.460788469690545e-08, "loss": 0.7227, "mean_token_accuracy": 0.7924880981445312, "num_tokens": 6789568.0, "step": 177 }, { "epoch": 0.0226434295891108, "ewc_loss": 0.0014495849609375, "ewc_loss_diag": 4.1909515857696533e-07, "ewc_loss_parallel": 1.0281801223754883e-06, "grad_norm": 6.138572692871094, "learning_rate": 7.503179313268333e-08, "loss": 0.7639, "mean_token_accuracy": 0.7764043211936951, "num_tokens": 6822810.0, "step": 178 }, { "epoch": 0.02277063986770131, "ewc_loss": 0.00145721435546875, "ewc_loss_diag": 4.2654573917388916e-07, "ewc_loss_parallel": 1.0281801223754883e-06, "grad_norm": 5.832324504852295, "learning_rate": 7.54557015684612e-08, "loss": 0.7028, "mean_token_accuracy": 0.7924889326095581, "num_tokens": 6861598.0, "step": 179 }, { "epoch": 0.02289785014629182, "ewc_loss": 0.00144195556640625, "ewc_loss_diag": 4.2654573917388916e-07, "ewc_loss_parallel": 1.0132789611816406e-06, "grad_norm": 5.775481224060059, "learning_rate": 7.587961000423907e-08, "loss": 0.7256, "mean_token_accuracy": 0.7895221710205078, "num_tokens": 6900220.0, "step": 180 }, { "epoch": 0.02302506042488233, "ewc_loss": 0.001434326171875, "ewc_loss_diag": 4.302710294723511e-07, "ewc_loss_parallel": 1.0058283805847168e-06, "grad_norm": 5.851550579071045, "learning_rate": 7.630351844001694e-08, "loss": 0.834, "mean_token_accuracy": 0.7590118646621704, "num_tokens": 6946983.0, "step": 181 }, { "epoch": 0.02315227070347284, "ewc_loss": 0.0014495849609375, "ewc_loss_diag": 4.3585896492004395e-07, "ewc_loss_parallel": 1.0132789611816406e-06, "grad_norm": 5.990159034729004, "learning_rate": 7.672742687579482e-08, "loss": 0.7395, "mean_token_accuracy": 0.7866032123565674, "num_tokens": 6986019.0, "step": 182 }, { "epoch": 0.02327948098206335, "ewc_loss": 0.00145721435546875, "ewc_loss_diag": 4.4330954551696777e-07, "ewc_loss_parallel": 1.0132789611816406e-06, "grad_norm": 5.97407865524292, "learning_rate": 7.715133531157269e-08, "loss": 0.7137, "mean_token_accuracy": 0.7864068746566772, "num_tokens": 7020722.0, "step": 183 }, { "epoch": 0.02340669126065386, "ewc_loss": 0.00147247314453125, "ewc_loss_diag": 4.4517219066619873e-07, "ewc_loss_parallel": 1.0281801223754883e-06, "grad_norm": 5.904985427856445, "learning_rate": 7.757524374735056e-08, "loss": 0.7851, "mean_token_accuracy": 0.7729376554489136, "num_tokens": 7062691.0, "step": 184 }, { "epoch": 0.02353390153924437, "ewc_loss": 0.0014801025390625, "ewc_loss_diag": 4.5262277126312256e-07, "ewc_loss_parallel": 1.0281801223754883e-06, "grad_norm": 5.889181613922119, "learning_rate": 7.799915218312844e-08, "loss": 0.7328, "mean_token_accuracy": 0.7875773906707764, "num_tokens": 7101090.0, "step": 185 }, { "epoch": 0.02366111181783488, "ewc_loss": 0.00148773193359375, "ewc_loss_diag": 4.5262277126312256e-07, "ewc_loss_parallel": 1.0356307029724121e-06, "grad_norm": 5.8867363929748535, "learning_rate": 7.842306061890631e-08, "loss": 0.7062, "mean_token_accuracy": 0.7889813780784607, "num_tokens": 7139891.0, "step": 186 }, { "epoch": 0.023788322096425393, "ewc_loss": 0.001495361328125, "ewc_loss_diag": 4.544854164123535e-07, "ewc_loss_parallel": 1.043081283569336e-06, "grad_norm": 5.9247565269470215, "learning_rate": 7.88469690546842e-08, "loss": 0.7549, "mean_token_accuracy": 0.7798112034797668, "num_tokens": 7179501.0, "step": 187 }, { "epoch": 0.0239155323750159, "ewc_loss": 0.00150299072265625, "ewc_loss_diag": 4.600733518600464e-07, "ewc_loss_parallel": 1.043081283569336e-06, "grad_norm": 6.0276970863342285, "learning_rate": 7.927087749046207e-08, "loss": 0.7246, "mean_token_accuracy": 0.7869460582733154, "num_tokens": 7213809.0, "step": 188 }, { "epoch": 0.02404274265360641, "ewc_loss": 0.00151824951171875, "ewc_loss_diag": 4.675239324569702e-07, "ewc_loss_parallel": 1.0505318641662598e-06, "grad_norm": 5.963469982147217, "learning_rate": 7.969478592623994e-08, "loss": 0.8, "mean_token_accuracy": 0.7653466463088989, "num_tokens": 7254493.0, "step": 189 }, { "epoch": 0.024169952932196922, "ewc_loss": 0.0015411376953125, "ewc_loss_diag": 4.7124922275543213e-07, "ewc_loss_parallel": 1.0654330253601074e-06, "grad_norm": 5.930583477020264, "learning_rate": 8.011869436201781e-08, "loss": 0.6579, "mean_token_accuracy": 0.8068197965621948, "num_tokens": 7292643.0, "step": 190 }, { "epoch": 0.024297163210787433, "ewc_loss": 0.00154876708984375, "ewc_loss_diag": 4.7497451305389404e-07, "ewc_loss_parallel": 1.0728836059570312e-06, "grad_norm": 6.001421928405762, "learning_rate": 8.054260279779568e-08, "loss": 0.7822, "mean_token_accuracy": 0.7746102809906006, "num_tokens": 7329810.0, "step": 191 }, { "epoch": 0.02442437348937794, "ewc_loss": 0.0015869140625, "ewc_loss_diag": 4.991888999938965e-07, "ewc_loss_parallel": 1.080334186553955e-06, "grad_norm": 9.356985092163086, "learning_rate": 8.096651123357356e-08, "loss": 0.7677, "mean_token_accuracy": 0.7763479948043823, "num_tokens": 7367630.0, "step": 192 }, { "epoch": 0.02455158376796845, "ewc_loss": 0.001617431640625, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 1.1473894119262695e-06, "grad_norm": 6.2287092208862305, "learning_rate": 8.139041966935143e-08, "loss": 0.749, "mean_token_accuracy": 0.780315637588501, "num_tokens": 7411580.0, "step": 193 }, { "epoch": 0.024678794046558962, "ewc_loss": 0.001617431640625, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 1.1399388313293457e-06, "grad_norm": 5.896068096160889, "learning_rate": 8.181432810512929e-08, "loss": 0.7046, "mean_token_accuracy": 0.7935051918029785, "num_tokens": 7451631.0, "step": 194 }, { "epoch": 0.024806004325149473, "ewc_loss": 0.0016021728515625, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 1.125037670135498e-06, "grad_norm": 5.868780136108398, "learning_rate": 8.223823654090716e-08, "loss": 0.7135, "mean_token_accuracy": 0.7896631360054016, "num_tokens": 7493645.0, "step": 195 }, { "epoch": 0.024933214603739984, "ewc_loss": 0.0016021728515625, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 1.125037670135498e-06, "grad_norm": 5.9279279708862305, "learning_rate": 8.266214497668503e-08, "loss": 0.674, "mean_token_accuracy": 0.8035737872123718, "num_tokens": 7538042.0, "step": 196 }, { "epoch": 0.02506042488233049, "ewc_loss": 0.001617431640625, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 1.1399388313293457e-06, "grad_norm": 6.05640172958374, "learning_rate": 8.30860534124629e-08, "loss": 0.7384, "mean_token_accuracy": 0.7839030027389526, "num_tokens": 7575374.0, "step": 197 }, { "epoch": 0.025187635160921002, "ewc_loss": 0.00164794921875, "ewc_loss_diag": 4.842877388000488e-07, "ewc_loss_parallel": 1.169741153717041e-06, "grad_norm": 6.488101959228516, "learning_rate": 8.350996184824078e-08, "loss": 0.7326, "mean_token_accuracy": 0.7837351560592651, "num_tokens": 7606880.0, "step": 198 }, { "epoch": 0.025314845439511513, "ewc_loss": 0.001678466796875, "ewc_loss_diag": 4.880130290985107e-07, "ewc_loss_parallel": 1.1920928955078125e-06, "grad_norm": 6.1699018478393555, "learning_rate": 8.393387028401865e-08, "loss": 0.7094, "mean_token_accuracy": 0.7923563122749329, "num_tokens": 7644840.0, "step": 199 }, { "epoch": 0.025442055718102024, "ewc_loss": 0.0016937255859375, "ewc_loss_diag": 4.880130290985107e-07, "ewc_loss_parallel": 1.2069940567016602e-06, "grad_norm": 6.093146800994873, "learning_rate": 8.435777871979652e-08, "loss": 0.7496, "mean_token_accuracy": 0.7788117527961731, "num_tokens": 7683856.0, "step": 200 }, { "epoch": 0.02556926599669253, "ewc_loss": 0.0017242431640625, "ewc_loss_diag": 4.991888999938965e-07, "ewc_loss_parallel": 1.2218952178955078e-06, "grad_norm": 6.321500301361084, "learning_rate": 8.47816871555744e-08, "loss": 0.7102, "mean_token_accuracy": 0.7912455797195435, "num_tokens": 7715306.0, "step": 201 }, { "epoch": 0.025696476275283042, "ewc_loss": 0.0017547607421875, "ewc_loss_diag": 5.066394805908203e-07, "ewc_loss_parallel": 1.2442469596862793e-06, "grad_norm": 6.32210636138916, "learning_rate": 8.520559559135227e-08, "loss": 0.669, "mean_token_accuracy": 0.8022521138191223, "num_tokens": 7752442.0, "step": 202 }, { "epoch": 0.025823686553873553, "ewc_loss": 0.0017852783203125, "ewc_loss_diag": 5.140900611877441e-07, "ewc_loss_parallel": 1.2665987014770508e-06, "grad_norm": 6.16402530670166, "learning_rate": 8.562950402713014e-08, "loss": 0.7527, "mean_token_accuracy": 0.7829238772392273, "num_tokens": 7799891.0, "step": 203 }, { "epoch": 0.025950896832464064, "ewc_loss": 0.001800537109375, "ewc_loss_diag": 5.140900611877441e-07, "ewc_loss_parallel": 1.2889504432678223e-06, "grad_norm": 6.365159034729004, "learning_rate": 8.605341246290801e-08, "loss": 0.7448, "mean_token_accuracy": 0.7801436185836792, "num_tokens": 7830696.0, "step": 204 }, { "epoch": 0.026078107111054575, "ewc_loss": 0.001861572265625, "ewc_loss_diag": 5.289912223815918e-07, "ewc_loss_parallel": 1.3336539268493652e-06, "grad_norm": 6.246297359466553, "learning_rate": 8.647732089868589e-08, "loss": 0.7134, "mean_token_accuracy": 0.7925214767456055, "num_tokens": 7870968.0, "step": 205 }, { "epoch": 0.026205317389645082, "ewc_loss": 0.00189971923828125, "ewc_loss_diag": 5.401670932769775e-07, "ewc_loss_parallel": 1.3560056686401367e-06, "grad_norm": 6.262171268463135, "learning_rate": 8.690122933446376e-08, "loss": 0.7367, "mean_token_accuracy": 0.7854197025299072, "num_tokens": 7914418.0, "step": 206 }, { "epoch": 0.026332527668235593, "ewc_loss": 0.001953125, "ewc_loss_diag": 5.476176738739014e-07, "ewc_loss_parallel": 1.4081597328186035e-06, "grad_norm": 6.21505069732666, "learning_rate": 8.732513777024163e-08, "loss": 0.6748, "mean_token_accuracy": 0.8061736822128296, "num_tokens": 7954544.0, "step": 207 }, { "epoch": 0.026459737946826104, "ewc_loss": 0.0019683837890625, "ewc_loss_diag": 5.476176738739014e-07, "ewc_loss_parallel": 1.4156103134155273e-06, "grad_norm": 6.369027614593506, "learning_rate": 8.77490462060195e-08, "loss": 0.724, "mean_token_accuracy": 0.7910237312316895, "num_tokens": 7997373.0, "step": 208 }, { "epoch": 0.026586948225416615, "ewc_loss": 0.0019989013671875, "ewc_loss_diag": 5.550682544708252e-07, "ewc_loss_parallel": 1.4454126358032227e-06, "grad_norm": 7.22798490524292, "learning_rate": 8.817295464179738e-08, "loss": 0.6898, "mean_token_accuracy": 0.7931194305419922, "num_tokens": 8033314.0, "step": 209 }, { "epoch": 0.026714158504007122, "ewc_loss": 0.002044677734375, "ewc_loss_diag": 5.587935447692871e-07, "ewc_loss_parallel": 1.4901161193847656e-06, "grad_norm": 6.662779331207275, "learning_rate": 8.859686307757525e-08, "loss": 0.7069, "mean_token_accuracy": 0.7926983833312988, "num_tokens": 8069362.0, "step": 210 }, { "epoch": 0.026841368782597633, "ewc_loss": 0.0020751953125, "ewc_loss_diag": 5.662441253662109e-07, "ewc_loss_parallel": 1.519918441772461e-06, "grad_norm": 6.716729164123535, "learning_rate": 8.902077151335312e-08, "loss": 0.7404, "mean_token_accuracy": 0.7806327939033508, "num_tokens": 8109126.0, "step": 211 }, { "epoch": 0.026968579061188144, "ewc_loss": 0.0021209716796875, "ewc_loss_diag": 5.811452865600586e-07, "ewc_loss_parallel": 1.5422701835632324e-06, "grad_norm": 7.349218845367432, "learning_rate": 8.944467994913098e-08, "loss": 0.796, "mean_token_accuracy": 0.7718111276626587, "num_tokens": 8143083.0, "step": 212 }, { "epoch": 0.027095789339778655, "ewc_loss": 0.002166748046875, "ewc_loss_diag": 5.885958671569824e-07, "ewc_loss_parallel": 1.5795230865478516e-06, "grad_norm": 6.870129108428955, "learning_rate": 8.986858838490885e-08, "loss": 0.6437, "mean_token_accuracy": 0.8053449392318726, "num_tokens": 8176636.0, "step": 213 }, { "epoch": 0.027222999618369163, "ewc_loss": 0.002197265625, "ewc_loss_diag": 5.885958671569824e-07, "ewc_loss_parallel": 1.601874828338623e-06, "grad_norm": 6.424748420715332, "learning_rate": 9.029249682068673e-08, "loss": 0.702, "mean_token_accuracy": 0.7916780710220337, "num_tokens": 8220262.0, "step": 214 }, { "epoch": 0.027350209896959674, "ewc_loss": 0.002197265625, "ewc_loss_diag": 5.923211574554443e-07, "ewc_loss_parallel": 1.6093254089355469e-06, "grad_norm": 6.604091644287109, "learning_rate": 9.07164052564646e-08, "loss": 0.7227, "mean_token_accuracy": 0.7894636392593384, "num_tokens": 8256303.0, "step": 215 }, { "epoch": 0.027477420175550184, "ewc_loss": 0.0022430419921875, "ewc_loss_diag": 5.997717380523682e-07, "ewc_loss_parallel": 1.6391277313232422e-06, "grad_norm": 7.204063415527344, "learning_rate": 9.114031369224247e-08, "loss": 0.6861, "mean_token_accuracy": 0.7992646098136902, "num_tokens": 8293888.0, "step": 216 }, { "epoch": 0.027604630454140695, "ewc_loss": 0.002288818359375, "ewc_loss_diag": 5.997717380523682e-07, "ewc_loss_parallel": 1.6838312149047852e-06, "grad_norm": 7.329031467437744, "learning_rate": 9.156422212802034e-08, "loss": 0.6479, "mean_token_accuracy": 0.8078166246414185, "num_tokens": 8335368.0, "step": 217 }, { "epoch": 0.027731840732731206, "ewc_loss": 0.0023345947265625, "ewc_loss_diag": 6.07222318649292e-07, "ewc_loss_parallel": 1.7210841178894043e-06, "grad_norm": 6.723156929016113, "learning_rate": 9.198813056379822e-08, "loss": 0.704, "mean_token_accuracy": 0.7950571775436401, "num_tokens": 8370338.0, "step": 218 }, { "epoch": 0.027859051011321714, "ewc_loss": 0.0023345947265625, "ewc_loss_diag": 6.109476089477539e-07, "ewc_loss_parallel": 1.7210841178894043e-06, "grad_norm": 6.518057346343994, "learning_rate": 9.241203899957609e-08, "loss": 0.6882, "mean_token_accuracy": 0.7963957786560059, "num_tokens": 8409904.0, "step": 219 }, { "epoch": 0.027986261289912225, "ewc_loss": 0.0023651123046875, "ewc_loss_diag": 6.295740604400635e-07, "ewc_loss_parallel": 1.7285346984863281e-06, "grad_norm": 6.911295413970947, "learning_rate": 9.283594743535396e-08, "loss": 0.8159, "mean_token_accuracy": 0.7632074356079102, "num_tokens": 8446579.0, "step": 220 }, { "epoch": 0.028113471568502735, "ewc_loss": 0.00238037109375, "ewc_loss_diag": 6.332993507385254e-07, "ewc_loss_parallel": 1.7583370208740234e-06, "grad_norm": 6.674582481384277, "learning_rate": 9.325985587113183e-08, "loss": 0.6614, "mean_token_accuracy": 0.8055092692375183, "num_tokens": 8483663.0, "step": 221 }, { "epoch": 0.028240681847093246, "ewc_loss": 0.00238037109375, "ewc_loss_diag": 6.332993507385254e-07, "ewc_loss_parallel": 1.7583370208740234e-06, "grad_norm": 6.90446138381958, "learning_rate": 9.368376430690971e-08, "loss": 0.6945, "mean_token_accuracy": 0.7940027117729187, "num_tokens": 8516346.0, "step": 222 }, { "epoch": 0.028367892125683754, "ewc_loss": 0.00250244140625, "ewc_loss_diag": 6.966292858123779e-07, "ewc_loss_parallel": 1.8030405044555664e-06, "grad_norm": 9.815899848937988, "learning_rate": 9.410767274268758e-08, "loss": 0.6889, "mean_token_accuracy": 0.7985458970069885, "num_tokens": 8556914.0, "step": 223 }, { "epoch": 0.028495102404274265, "ewc_loss": 0.0025787353515625, "ewc_loss_diag": 6.444752216339111e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.6474761962890625, "learning_rate": 9.453158117846545e-08, "loss": 0.6766, "mean_token_accuracy": 0.796851396560669, "num_tokens": 8596548.0, "step": 224 }, { "epoch": 0.028622312682864776, "ewc_loss": 0.0025634765625, "ewc_loss_diag": 6.48200511932373e-07, "ewc_loss_parallel": 1.9222497940063477e-06, "grad_norm": 7.158196926116943, "learning_rate": 9.495548961424333e-08, "loss": 0.7081, "mean_token_accuracy": 0.794617772102356, "num_tokens": 8634366.0, "step": 225 }, { "epoch": 0.028749522961455286, "ewc_loss": 0.0025482177734375, "ewc_loss_diag": 6.51925802230835e-07, "ewc_loss_parallel": 1.8998980522155762e-06, "grad_norm": 6.650304794311523, "learning_rate": 9.53793980500212e-08, "loss": 0.648, "mean_token_accuracy": 0.8070992827415466, "num_tokens": 8666480.0, "step": 226 }, { "epoch": 0.028876733240045797, "ewc_loss": 0.0025177001953125, "ewc_loss_diag": 6.556510925292969e-07, "ewc_loss_parallel": 1.862645149230957e-06, "grad_norm": 7.407574653625488, "learning_rate": 9.580330648579907e-08, "loss": 0.7975, "mean_token_accuracy": 0.7708038091659546, "num_tokens": 8705880.0, "step": 227 }, { "epoch": 0.029003943518636305, "ewc_loss": 0.0025482177734375, "ewc_loss_diag": 6.556510925292969e-07, "ewc_loss_parallel": 1.8924474716186523e-06, "grad_norm": 6.726161956787109, "learning_rate": 9.622721492157694e-08, "loss": 0.6594, "mean_token_accuracy": 0.8044995665550232, "num_tokens": 8743772.0, "step": 228 }, { "epoch": 0.029131153797226816, "ewc_loss": 0.0025634765625, "ewc_loss_diag": 6.593763828277588e-07, "ewc_loss_parallel": 1.8998980522155762e-06, "grad_norm": 7.068267822265625, "learning_rate": 9.665112335735482e-08, "loss": 0.685, "mean_token_accuracy": 0.7931528091430664, "num_tokens": 8775919.0, "step": 229 }, { "epoch": 0.029258364075817327, "ewc_loss": 0.0025634765625, "ewc_loss_diag": 6.631016731262207e-07, "ewc_loss_parallel": 1.9073486328125e-06, "grad_norm": 6.947673320770264, "learning_rate": 9.707503179313267e-08, "loss": 0.6317, "mean_token_accuracy": 0.8100448846817017, "num_tokens": 8809336.0, "step": 230 }, { "epoch": 0.029385574354407838, "ewc_loss": 0.0025787353515625, "ewc_loss_diag": 6.705522537231445e-07, "ewc_loss_parallel": 1.9073486328125e-06, "grad_norm": 7.532566547393799, "learning_rate": 9.749894022891055e-08, "loss": 0.6831, "mean_token_accuracy": 0.796868622303009, "num_tokens": 8845952.0, "step": 231 }, { "epoch": 0.029512784632998345, "ewc_loss": 0.00262451171875, "ewc_loss_diag": 6.780028343200684e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.198674201965332, "learning_rate": 9.792284866468842e-08, "loss": 0.7037, "mean_token_accuracy": 0.7878479957580566, "num_tokens": 8889801.0, "step": 232 }, { "epoch": 0.029639994911588856, "ewc_loss": 0.00262451171875, "ewc_loss_diag": 6.780028343200684e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.262153625488281, "learning_rate": 9.834675710046629e-08, "loss": 0.6633, "mean_token_accuracy": 0.7990002036094666, "num_tokens": 8925429.0, "step": 233 }, { "epoch": 0.029767205190179367, "ewc_loss": 0.00262451171875, "ewc_loss_diag": 6.854534149169922e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 6.849272727966309, "learning_rate": 9.877066553624416e-08, "loss": 0.6946, "mean_token_accuracy": 0.7921336889266968, "num_tokens": 8963360.0, "step": 234 }, { "epoch": 0.029894415468769878, "ewc_loss": 0.0026397705078125, "ewc_loss_diag": 6.966292858123779e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 6.771679401397705, "learning_rate": 9.919457397202204e-08, "loss": 0.7075, "mean_token_accuracy": 0.7928781509399414, "num_tokens": 8998314.0, "step": 235 }, { "epoch": 0.030021625747360385, "ewc_loss": 0.0026397705078125, "ewc_loss_diag": 6.966292858123779e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.477994918823242, "learning_rate": 9.961848240779991e-08, "loss": 0.778, "mean_token_accuracy": 0.7728638648986816, "num_tokens": 9034804.0, "step": 236 }, { "epoch": 0.030148836025950896, "ewc_loss": 0.002655029296875, "ewc_loss_diag": 7.040798664093018e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.786286354064941, "learning_rate": 1.0004239084357778e-07, "loss": 0.7005, "mean_token_accuracy": 0.7932915687561035, "num_tokens": 9070840.0, "step": 237 }, { "epoch": 0.030276046304541407, "ewc_loss": 0.0027008056640625, "ewc_loss_diag": 7.338821887969971e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 8.583678245544434, "learning_rate": 1.0046629927935566e-07, "loss": 0.6808, "mean_token_accuracy": 0.7996270656585693, "num_tokens": 9109370.0, "step": 238 }, { "epoch": 0.030403256583131918, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 7.152557373046875e-07, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 6.868593692779541, "learning_rate": 1.0089020771513353e-07, "loss": 0.6223, "mean_token_accuracy": 0.8139910697937012, "num_tokens": 9148191.0, "step": 239 }, { "epoch": 0.03053046686172243, "ewc_loss": 0.0026702880859375, "ewc_loss_diag": 7.227063179016113e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 6.968650817871094, "learning_rate": 1.013141161509114e-07, "loss": 0.6902, "mean_token_accuracy": 0.7958415746688843, "num_tokens": 9182923.0, "step": 240 }, { "epoch": 0.030657677140312936, "ewc_loss": 0.002685546875, "ewc_loss_diag": 7.301568984985352e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.741396427154541, "learning_rate": 1.0173802458668927e-07, "loss": 0.6908, "mean_token_accuracy": 0.7928800582885742, "num_tokens": 9215882.0, "step": 241 }, { "epoch": 0.030784887418903447, "ewc_loss": 0.002685546875, "ewc_loss_diag": 7.301568984985352e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 8.089046478271484, "learning_rate": 1.0216193302246715e-07, "loss": 0.7225, "mean_token_accuracy": 0.7817505598068237, "num_tokens": 9249718.0, "step": 242 }, { "epoch": 0.030912097697493958, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 7.413327693939209e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 7.34392786026001, "learning_rate": 1.0258584145824502e-07, "loss": 0.6418, "mean_token_accuracy": 0.8090612888336182, "num_tokens": 9291146.0, "step": 243 }, { "epoch": 0.03103930797608447, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 7.413327693939209e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 7.468616008758545, "learning_rate": 1.0300974989402289e-07, "loss": 0.6364, "mean_token_accuracy": 0.8090356588363647, "num_tokens": 9326403.0, "step": 244 }, { "epoch": 0.031166518254674976, "ewc_loss": 0.0027008056640625, "ewc_loss_diag": 7.487833499908447e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 6.874899387359619, "learning_rate": 1.0343365832980076e-07, "loss": 0.6884, "mean_token_accuracy": 0.795189380645752, "num_tokens": 9368491.0, "step": 245 }, { "epoch": 0.03129372853326549, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 7.599592208862305e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 6.909482002258301, "learning_rate": 1.0385756676557864e-07, "loss": 0.6848, "mean_token_accuracy": 0.796924889087677, "num_tokens": 9409041.0, "step": 246 }, { "epoch": 0.031420938811855995, "ewc_loss": 0.0027008056640625, "ewc_loss_diag": 7.674098014831543e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.434316158294678, "learning_rate": 1.0428147520135651e-07, "loss": 0.6566, "mean_token_accuracy": 0.8043755292892456, "num_tokens": 9448591.0, "step": 247 }, { "epoch": 0.03154814909044651, "ewc_loss": 0.0027313232421875, "ewc_loss_diag": 7.748603820800781e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.008411884307861, "learning_rate": 1.0470538363713437e-07, "loss": 0.6875, "mean_token_accuracy": 0.7940206527709961, "num_tokens": 9486604.0, "step": 248 }, { "epoch": 0.031675359369037016, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 7.897615432739258e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.462218761444092, "learning_rate": 1.0512929207291224e-07, "loss": 0.6937, "mean_token_accuracy": 0.7891923785209656, "num_tokens": 9525285.0, "step": 249 }, { "epoch": 0.03180256964762753, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 7.897615432739258e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 8.986283302307129, "learning_rate": 1.0555320050869011e-07, "loss": 0.6732, "mean_token_accuracy": 0.7947782278060913, "num_tokens": 9557898.0, "step": 250 }, { "epoch": 0.03192977992621804, "ewc_loss": 0.0027923583984375, "ewc_loss_diag": 7.934868335723877e-07, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 7.458965301513672, "learning_rate": 1.0597710894446799e-07, "loss": 0.7347, "mean_token_accuracy": 0.7781883478164673, "num_tokens": 9590438.0, "step": 251 }, { "epoch": 0.032056990204808546, "ewc_loss": 0.0027618408203125, "ewc_loss_diag": 7.972121238708496e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 7.030106067657471, "learning_rate": 1.0640101738024586e-07, "loss": 0.6415, "mean_token_accuracy": 0.8078725934028625, "num_tokens": 9631402.0, "step": 252 }, { "epoch": 0.03218420048339906, "ewc_loss": 0.0027313232421875, "ewc_loss_diag": 7.972121238708496e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 9.252415657043457, "learning_rate": 1.0682492581602373e-07, "loss": 0.625, "mean_token_accuracy": 0.8133389353752136, "num_tokens": 9663702.0, "step": 253 }, { "epoch": 0.03231141076198957, "ewc_loss": 0.0028076171875, "ewc_loss_diag": 8.195638656616211e-07, "ewc_loss_parallel": 1.9818544387817383e-06, "grad_norm": 8.114200592041016, "learning_rate": 1.072488342518016e-07, "loss": 0.6123, "mean_token_accuracy": 0.8127552270889282, "num_tokens": 9698969.0, "step": 254 }, { "epoch": 0.03243862104058008, "ewc_loss": 0.0027923583984375, "ewc_loss_diag": 8.121132850646973e-07, "ewc_loss_parallel": 1.9818544387817383e-06, "grad_norm": 7.351579666137695, "learning_rate": 1.0767274268757948e-07, "loss": 0.6818, "mean_token_accuracy": 0.794392466545105, "num_tokens": 9738075.0, "step": 255 }, { "epoch": 0.03256583131917059, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 8.121132850646973e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.619417667388916, "learning_rate": 1.0809665112335735e-07, "loss": 0.6858, "mean_token_accuracy": 0.796807050704956, "num_tokens": 9782675.0, "step": 256 }, { "epoch": 0.0326930415977611, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 8.158385753631592e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.343493461608887, "learning_rate": 1.0852055955913522e-07, "loss": 0.5855, "mean_token_accuracy": 0.8235076665878296, "num_tokens": 9820585.0, "step": 257 }, { "epoch": 0.03282025187635161, "ewc_loss": 0.002777099609375, "ewc_loss_diag": 8.23289155960083e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 8.0139799118042, "learning_rate": 1.089444679949131e-07, "loss": 0.7115, "mean_token_accuracy": 0.7894427180290222, "num_tokens": 9860693.0, "step": 258 }, { "epoch": 0.03294746215494212, "ewc_loss": 0.0028076171875, "ewc_loss_diag": 8.381903171539307e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 7.776782035827637, "learning_rate": 1.0936837643069097e-07, "loss": 0.6438, "mean_token_accuracy": 0.807050883769989, "num_tokens": 9902629.0, "step": 259 }, { "epoch": 0.03307467243353263, "ewc_loss": 0.0027923583984375, "ewc_loss_diag": 8.381903171539307e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.199270248413086, "learning_rate": 1.0979228486646884e-07, "loss": 0.6163, "mean_token_accuracy": 0.8123204112052917, "num_tokens": 9937304.0, "step": 260 }, { "epoch": 0.03320188271212314, "ewc_loss": 0.0027618408203125, "ewc_loss_diag": 8.419156074523926e-07, "ewc_loss_parallel": 1.9222497940063477e-06, "grad_norm": 7.438126087188721, "learning_rate": 1.1021619330224671e-07, "loss": 0.7491, "mean_token_accuracy": 0.7775275111198425, "num_tokens": 9976911.0, "step": 261 }, { "epoch": 0.03332909299071365, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 8.456408977508545e-07, "ewc_loss_parallel": 1.8998980522155762e-06, "grad_norm": 7.534211158752441, "learning_rate": 1.1064010173802458e-07, "loss": 0.6833, "mean_token_accuracy": 0.79674232006073, "num_tokens": 10015740.0, "step": 262 }, { "epoch": 0.03345630326930416, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 8.493661880493164e-07, "ewc_loss_parallel": 1.8998980522155762e-06, "grad_norm": 8.45260238647461, "learning_rate": 1.1106401017380246e-07, "loss": 0.6214, "mean_token_accuracy": 0.8148287534713745, "num_tokens": 10053054.0, "step": 263 }, { "epoch": 0.03358351354789467, "ewc_loss": 0.0028228759765625, "ewc_loss_diag": 8.530914783477783e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 9.535659790039062, "learning_rate": 1.1148791860958033e-07, "loss": 0.7206, "mean_token_accuracy": 0.7811868786811829, "num_tokens": 10082088.0, "step": 264 }, { "epoch": 0.03371072382648518, "ewc_loss": 0.0028533935546875, "ewc_loss_diag": 8.530914783477783e-07, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 9.7388277053833, "learning_rate": 1.119118270453582e-07, "loss": 0.6911, "mean_token_accuracy": 0.7931672930717468, "num_tokens": 10121122.0, "step": 265 }, { "epoch": 0.03383793410507569, "ewc_loss": 0.00286865234375, "ewc_loss_diag": 8.642673492431641e-07, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 7.711782932281494, "learning_rate": 1.1233573548113607e-07, "loss": 0.6842, "mean_token_accuracy": 0.7964777946472168, "num_tokens": 10161017.0, "step": 266 }, { "epoch": 0.0339651443836662, "ewc_loss": 0.0027923583984375, "ewc_loss_diag": 8.754432201385498e-07, "ewc_loss_parallel": 1.9222497940063477e-06, "grad_norm": 7.6315693855285645, "learning_rate": 1.1275964391691393e-07, "loss": 0.6493, "mean_token_accuracy": 0.8062055706977844, "num_tokens": 10204913.0, "step": 267 }, { "epoch": 0.03409235466225671, "ewc_loss": 0.002685546875, "ewc_loss_diag": 8.754432201385498e-07, "ewc_loss_parallel": 1.817941665649414e-06, "grad_norm": 8.19703483581543, "learning_rate": 1.131835523526918e-07, "loss": 0.698, "mean_token_accuracy": 0.7887712717056274, "num_tokens": 10245154.0, "step": 268 }, { "epoch": 0.03421956494084722, "ewc_loss": 0.002777099609375, "ewc_loss_diag": 9.275972843170166e-07, "ewc_loss_parallel": 1.8551945686340332e-06, "grad_norm": 10.371549606323242, "learning_rate": 1.1360746078846968e-07, "loss": 0.705, "mean_token_accuracy": 0.7890752553939819, "num_tokens": 10283006.0, "step": 269 }, { "epoch": 0.03434677521943773, "ewc_loss": 0.00286865234375, "ewc_loss_diag": 8.791685104370117e-07, "ewc_loss_parallel": 1.9818544387817383e-06, "grad_norm": 8.640369415283203, "learning_rate": 1.1403136922424755e-07, "loss": 0.6665, "mean_token_accuracy": 0.7995623350143433, "num_tokens": 10316689.0, "step": 270 }, { "epoch": 0.03447398549802824, "ewc_loss": 0.0028228759765625, "ewc_loss_diag": 8.828938007354736e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 7.817602157592773, "learning_rate": 1.1445527766002542e-07, "loss": 0.659, "mean_token_accuracy": 0.8000513911247253, "num_tokens": 10358145.0, "step": 271 }, { "epoch": 0.03460119577661875, "ewc_loss": 0.002685546875, "ewc_loss_diag": 8.791685104370117e-07, "ewc_loss_parallel": 1.8104910850524902e-06, "grad_norm": 9.1510648727417, "learning_rate": 1.148791860958033e-07, "loss": 0.7613, "mean_token_accuracy": 0.7743079662322998, "num_tokens": 10393424.0, "step": 272 }, { "epoch": 0.034728406055209264, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 8.828938007354736e-07, "ewc_loss_parallel": 1.8402934074401855e-06, "grad_norm": 7.2625932693481445, "learning_rate": 1.1530309453158117e-07, "loss": 0.6743, "mean_token_accuracy": 0.7971731424331665, "num_tokens": 10433560.0, "step": 273 }, { "epoch": 0.03485561633379977, "ewc_loss": 0.0026702880859375, "ewc_loss_diag": 8.791685104370117e-07, "ewc_loss_parallel": 1.7955899238586426e-06, "grad_norm": 8.034065246582031, "learning_rate": 1.1572700296735904e-07, "loss": 0.6715, "mean_token_accuracy": 0.7975937128067017, "num_tokens": 10472024.0, "step": 274 }, { "epoch": 0.03498282661239028, "ewc_loss": 0.002685546875, "ewc_loss_diag": 8.791685104370117e-07, "ewc_loss_parallel": 1.8030405044555664e-06, "grad_norm": 8.144327163696289, "learning_rate": 1.1615091140313691e-07, "loss": 0.7109, "mean_token_accuracy": 0.7875083684921265, "num_tokens": 10510874.0, "step": 275 }, { "epoch": 0.03511003689098079, "ewc_loss": 0.0027008056640625, "ewc_loss_diag": 8.866190910339355e-07, "ewc_loss_parallel": 1.817941665649414e-06, "grad_norm": 7.499880790710449, "learning_rate": 1.1657481983891479e-07, "loss": 0.6475, "mean_token_accuracy": 0.8058211803436279, "num_tokens": 10552411.0, "step": 276 }, { "epoch": 0.0352372471695713, "ewc_loss": 0.002685546875, "ewc_loss_diag": 8.940696716308594e-07, "ewc_loss_parallel": 1.7955899238586426e-06, "grad_norm": 9.182597160339355, "learning_rate": 1.1699872827469266e-07, "loss": 0.665, "mean_token_accuracy": 0.8020536303520203, "num_tokens": 10591537.0, "step": 277 }, { "epoch": 0.03536445744816181, "ewc_loss": 0.002777099609375, "ewc_loss_diag": 9.015202522277832e-07, "ewc_loss_parallel": 1.8700957298278809e-06, "grad_norm": 8.01163387298584, "learning_rate": 1.1742263671047053e-07, "loss": 0.6729, "mean_token_accuracy": 0.7986915111541748, "num_tokens": 10632228.0, "step": 278 }, { "epoch": 0.03549166772675232, "ewc_loss": 0.0027313232421875, "ewc_loss_diag": 9.052455425262451e-07, "ewc_loss_parallel": 1.8328428268432617e-06, "grad_norm": 8.57589054107666, "learning_rate": 1.178465451462484e-07, "loss": 0.7282, "mean_token_accuracy": 0.7804428935050964, "num_tokens": 10672829.0, "step": 279 }, { "epoch": 0.03561887800534283, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 9.015202522277832e-07, "ewc_loss_parallel": 1.817941665649414e-06, "grad_norm": 9.468282699584961, "learning_rate": 1.1827045358202628e-07, "loss": 0.703, "mean_token_accuracy": 0.795008659362793, "num_tokens": 10706146.0, "step": 280 }, { "epoch": 0.035746088283933344, "ewc_loss": 0.002777099609375, "ewc_loss_diag": 9.164214134216309e-07, "ewc_loss_parallel": 1.862645149230957e-06, "grad_norm": 7.976539611816406, "learning_rate": 1.1869436201780415e-07, "loss": 0.6186, "mean_token_accuracy": 0.8150674104690552, "num_tokens": 10744112.0, "step": 281 }, { "epoch": 0.03587329856252385, "ewc_loss": 0.002716064453125, "ewc_loss_diag": 9.126961231231689e-07, "ewc_loss_parallel": 1.8104910850524902e-06, "grad_norm": 7.865006446838379, "learning_rate": 1.1911827045358202e-07, "loss": 0.6501, "mean_token_accuracy": 0.8101885318756104, "num_tokens": 10783726.0, "step": 282 }, { "epoch": 0.03600050884111436, "ewc_loss": 0.0027008056640625, "ewc_loss_diag": 9.164214134216309e-07, "ewc_loss_parallel": 1.7881393432617188e-06, "grad_norm": 7.55610466003418, "learning_rate": 1.195421788893599e-07, "loss": 0.714, "mean_token_accuracy": 0.7891660928726196, "num_tokens": 10824476.0, "step": 283 }, { "epoch": 0.036127719119704874, "ewc_loss": 0.002685546875, "ewc_loss_diag": 9.201467037200928e-07, "ewc_loss_parallel": 1.773238182067871e-06, "grad_norm": 8.442386627197266, "learning_rate": 1.1996608732513778e-07, "loss": 0.6751, "mean_token_accuracy": 0.7979835271835327, "num_tokens": 10859997.0, "step": 284 }, { "epoch": 0.03625492939829538, "ewc_loss": 0.0027618408203125, "ewc_loss_diag": 9.313225746154785e-07, "ewc_loss_parallel": 1.8328428268432617e-06, "grad_norm": 8.113980293273926, "learning_rate": 1.2038999576091563e-07, "loss": 0.6451, "mean_token_accuracy": 0.8076297044754028, "num_tokens": 10898800.0, "step": 285 }, { "epoch": 0.036382139676885895, "ewc_loss": 0.0027618408203125, "ewc_loss_diag": 9.313225746154785e-07, "ewc_loss_parallel": 1.8328428268432617e-06, "grad_norm": 8.15924072265625, "learning_rate": 1.208139041966935e-07, "loss": 0.6103, "mean_token_accuracy": 0.8178250193595886, "num_tokens": 10937462.0, "step": 286 }, { "epoch": 0.0365093499554764, "ewc_loss": 0.002777099609375, "ewc_loss_diag": 9.5367431640625e-07, "ewc_loss_parallel": 1.8253922462463379e-06, "grad_norm": 8.514077186584473, "learning_rate": 1.2123781263247137e-07, "loss": 0.673, "mean_token_accuracy": 0.795615553855896, "num_tokens": 10970739.0, "step": 287 }, { "epoch": 0.03663656023406691, "ewc_loss": 0.0028076171875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8402934074401855e-06, "grad_norm": 8.439787864685059, "learning_rate": 1.2166172106824924e-07, "loss": 0.7249, "mean_token_accuracy": 0.7892676591873169, "num_tokens": 11003000.0, "step": 288 }, { "epoch": 0.036763770512657425, "ewc_loss": 0.0028076171875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8402934074401855e-06, "grad_norm": 7.997354030609131, "learning_rate": 1.2208562950402712e-07, "loss": 0.629, "mean_token_accuracy": 0.8080614805221558, "num_tokens": 11039665.0, "step": 289 }, { "epoch": 0.03689098079124793, "ewc_loss": 0.002777099609375, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.817941665649414e-06, "grad_norm": 7.548630237579346, "learning_rate": 1.22509537939805e-07, "loss": 0.6562, "mean_token_accuracy": 0.7976484298706055, "num_tokens": 11078368.0, "step": 290 }, { "epoch": 0.03701819106983844, "ewc_loss": 0.00274658203125, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.7881393432617188e-06, "grad_norm": 7.643664360046387, "learning_rate": 1.2293344637558286e-07, "loss": 0.6816, "mean_token_accuracy": 0.7972036600112915, "num_tokens": 11122654.0, "step": 291 }, { "epoch": 0.037145401348428954, "ewc_loss": 0.0028076171875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8477439880371094e-06, "grad_norm": 8.180115699768066, "learning_rate": 1.2335735481136073e-07, "loss": 0.6374, "mean_token_accuracy": 0.8030790686607361, "num_tokens": 11162622.0, "step": 292 }, { "epoch": 0.03727261162701946, "ewc_loss": 0.002838134765625, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8775463104248047e-06, "grad_norm": 8.576739311218262, "learning_rate": 1.237812632471386e-07, "loss": 0.6488, "mean_token_accuracy": 0.8055450916290283, "num_tokens": 11202564.0, "step": 293 }, { "epoch": 0.037399821905609976, "ewc_loss": 0.0028533935546875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8924474716186523e-06, "grad_norm": 9.934002876281738, "learning_rate": 1.2420517168291648e-07, "loss": 0.6679, "mean_token_accuracy": 0.7958939075469971, "num_tokens": 11243422.0, "step": 294 }, { "epoch": 0.03752703218420048, "ewc_loss": 0.002899169921875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 8.568209648132324, "learning_rate": 1.2462908011869435e-07, "loss": 0.6486, "mean_token_accuracy": 0.801925778388977, "num_tokens": 11280867.0, "step": 295 }, { "epoch": 0.03765424246279099, "ewc_loss": 0.0028533935546875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8924474716186523e-06, "grad_norm": 8.443742752075195, "learning_rate": 1.2505298855447223e-07, "loss": 0.5963, "mean_token_accuracy": 0.8188095092773438, "num_tokens": 11318454.0, "step": 296 }, { "epoch": 0.037781452741381505, "ewc_loss": 0.0028076171875, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8477439880371094e-06, "grad_norm": 8.96381664276123, "learning_rate": 1.254768969902501e-07, "loss": 0.7386, "mean_token_accuracy": 0.7813609838485718, "num_tokens": 11354721.0, "step": 297 }, { "epoch": 0.03790866301997201, "ewc_loss": 0.002838134765625, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8775463104248047e-06, "grad_norm": 8.227420806884766, "learning_rate": 1.2590080542602797e-07, "loss": 0.6974, "mean_token_accuracy": 0.7913801074028015, "num_tokens": 11385938.0, "step": 298 }, { "epoch": 0.03803587329856253, "ewc_loss": 0.002838134765625, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 1.8700957298278809e-06, "grad_norm": 8.464527130126953, "learning_rate": 1.2632471386180584e-07, "loss": 0.6764, "mean_token_accuracy": 0.7964542508125305, "num_tokens": 11424911.0, "step": 299 }, { "epoch": 0.038163083577153034, "ewc_loss": 0.002838134765625, "ewc_loss_diag": 9.685754776000977e-07, "ewc_loss_parallel": 1.8775463104248047e-06, "grad_norm": 8.405414581298828, "learning_rate": 1.2674862229758372e-07, "loss": 0.643, "mean_token_accuracy": 0.8083763122558594, "num_tokens": 11460580.0, "step": 300 }, { "epoch": 0.03829029385574354, "ewc_loss": 0.0028533935546875, "ewc_loss_diag": 9.685754776000977e-07, "ewc_loss_parallel": 1.8849968910217285e-06, "grad_norm": 8.005681037902832, "learning_rate": 1.271725307333616e-07, "loss": 0.633, "mean_token_accuracy": 0.8101776838302612, "num_tokens": 11501031.0, "step": 301 }, { "epoch": 0.038417504134334056, "ewc_loss": 0.002838134765625, "ewc_loss_diag": 9.760260581970215e-07, "ewc_loss_parallel": 1.8551945686340332e-06, "grad_norm": 7.741772174835205, "learning_rate": 1.2759643916913946e-07, "loss": 0.6425, "mean_token_accuracy": 0.8075206279754639, "num_tokens": 11544053.0, "step": 302 }, { "epoch": 0.03854471441292456, "ewc_loss": 0.002838134765625, "ewc_loss_diag": 9.760260581970215e-07, "ewc_loss_parallel": 1.862645149230957e-06, "grad_norm": 10.037664413452148, "learning_rate": 1.2802034760491733e-07, "loss": 0.6599, "mean_token_accuracy": 0.79906165599823, "num_tokens": 11576944.0, "step": 303 }, { "epoch": 0.03867192469151508, "ewc_loss": 0.0029449462890625, "ewc_loss_diag": 9.760260581970215e-07, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 7.737445831298828, "learning_rate": 1.284442560406952e-07, "loss": 0.7091, "mean_token_accuracy": 0.786274790763855, "num_tokens": 11619735.0, "step": 304 }, { "epoch": 0.038799134970105585, "ewc_loss": 0.00286865234375, "ewc_loss_diag": 9.760260581970215e-07, "ewc_loss_parallel": 1.8924474716186523e-06, "grad_norm": 8.031782150268555, "learning_rate": 1.2886816447647308e-07, "loss": 0.7477, "mean_token_accuracy": 0.7746365070343018, "num_tokens": 11655025.0, "step": 305 }, { "epoch": 0.03892634524869609, "ewc_loss": 0.0028533935546875, "ewc_loss_diag": 9.760260581970215e-07, "ewc_loss_parallel": 1.8775463104248047e-06, "grad_norm": 8.95789623260498, "learning_rate": 1.2929207291225095e-07, "loss": 0.6436, "mean_token_accuracy": 0.8067617416381836, "num_tokens": 11691970.0, "step": 306 }, { "epoch": 0.03905355552728661, "ewc_loss": 0.0029296875, "ewc_loss_diag": 9.834766387939453e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 8.996432304382324, "learning_rate": 1.2971598134802882e-07, "loss": 0.6601, "mean_token_accuracy": 0.7998731732368469, "num_tokens": 11729561.0, "step": 307 }, { "epoch": 0.039180765805877114, "ewc_loss": 0.0029754638671875, "ewc_loss_diag": 1.0058283805847168e-06, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 8.54892635345459, "learning_rate": 1.301398897838067e-07, "loss": 0.6799, "mean_token_accuracy": 0.7924846410751343, "num_tokens": 11765707.0, "step": 308 }, { "epoch": 0.03930797608446762, "ewc_loss": 0.002960205078125, "ewc_loss_diag": 9.98377799987793e-07, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 8.450095176696777, "learning_rate": 1.3056379821958457e-07, "loss": 0.6891, "mean_token_accuracy": 0.7981252670288086, "num_tokens": 11806223.0, "step": 309 }, { "epoch": 0.039435186363058136, "ewc_loss": 0.0029296875, "ewc_loss_diag": 9.98377799987793e-07, "ewc_loss_parallel": 1.9371509552001953e-06, "grad_norm": 8.036269187927246, "learning_rate": 1.3098770665536244e-07, "loss": 0.6427, "mean_token_accuracy": 0.807303786277771, "num_tokens": 11845477.0, "step": 310 }, { "epoch": 0.039562396641648644, "ewc_loss": 0.002960205078125, "ewc_loss_diag": 1.0058283805847168e-06, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 8.837372779846191, "learning_rate": 1.3141161509114031e-07, "loss": 0.6283, "mean_token_accuracy": 0.8054032325744629, "num_tokens": 11877973.0, "step": 311 }, { "epoch": 0.03968960692023916, "ewc_loss": 0.00299072265625, "ewc_loss_diag": 1.0058283805847168e-06, "ewc_loss_parallel": 1.9818544387817383e-06, "grad_norm": 8.220643043518066, "learning_rate": 1.3183552352691819e-07, "loss": 0.6368, "mean_token_accuracy": 0.807769238948822, "num_tokens": 11912093.0, "step": 312 }, { "epoch": 0.039816817198829665, "ewc_loss": 0.002960205078125, "ewc_loss_diag": 1.0058283805847168e-06, "ewc_loss_parallel": 1.952052116394043e-06, "grad_norm": 7.904790878295898, "learning_rate": 1.3225943196269603e-07, "loss": 0.6608, "mean_token_accuracy": 0.8040934801101685, "num_tokens": 11952541.0, "step": 313 }, { "epoch": 0.03994402747742017, "ewc_loss": 0.00299072265625, "ewc_loss_diag": 1.0132789611816406e-06, "ewc_loss_parallel": 1.9669532775878906e-06, "grad_norm": 8.354930877685547, "learning_rate": 1.3268334039847393e-07, "loss": 0.6384, "mean_token_accuracy": 0.8056256771087646, "num_tokens": 11990414.0, "step": 314 }, { "epoch": 0.04007123775601069, "ewc_loss": 0.003021240234375, "ewc_loss_diag": 1.0281801223754883e-06, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 8.59575080871582, "learning_rate": 1.3310724883425178e-07, "loss": 0.7078, "mean_token_accuracy": 0.7842620015144348, "num_tokens": 12025925.0, "step": 315 }, { "epoch": 0.040198448034601195, "ewc_loss": 0.0030517578125, "ewc_loss_diag": 1.0356307029724121e-06, "ewc_loss_parallel": 2.0116567611694336e-06, "grad_norm": 8.332438468933105, "learning_rate": 1.3353115727002968e-07, "loss": 0.6723, "mean_token_accuracy": 0.7978520393371582, "num_tokens": 12068879.0, "step": 316 }, { "epoch": 0.04032565831319171, "ewc_loss": 0.003021240234375, "ewc_loss_diag": 1.0281801223754883e-06, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 8.369187355041504, "learning_rate": 1.3395506570580752e-07, "loss": 0.6387, "mean_token_accuracy": 0.806148111820221, "num_tokens": 12111931.0, "step": 317 }, { "epoch": 0.040452868591782216, "ewc_loss": 0.0030364990234375, "ewc_loss_diag": 1.0356307029724121e-06, "ewc_loss_parallel": 1.996755599975586e-06, "grad_norm": 8.404977798461914, "learning_rate": 1.3437897414158542e-07, "loss": 0.7094, "mean_token_accuracy": 0.7836259603500366, "num_tokens": 12140431.0, "step": 318 }, { "epoch": 0.040580078870372724, "ewc_loss": 0.003082275390625, "ewc_loss_diag": 1.0505318641662598e-06, "ewc_loss_parallel": 2.0265579223632812e-06, "grad_norm": 8.949193954467773, "learning_rate": 1.3480288257736327e-07, "loss": 0.6358, "mean_token_accuracy": 0.8063379526138306, "num_tokens": 12173932.0, "step": 319 }, { "epoch": 0.04070728914896324, "ewc_loss": 0.003082275390625, "ewc_loss_diag": 1.0579824447631836e-06, "ewc_loss_parallel": 2.0265579223632812e-06, "grad_norm": 8.272421836853027, "learning_rate": 1.3522679101314117e-07, "loss": 0.6154, "mean_token_accuracy": 0.8103653788566589, "num_tokens": 12210040.0, "step": 320 }, { "epoch": 0.040834499427553746, "ewc_loss": 0.003082275390625, "ewc_loss_diag": 1.0579824447631836e-06, "ewc_loss_parallel": 2.0116567611694336e-06, "grad_norm": 7.797531604766846, "learning_rate": 1.35650699448919e-07, "loss": 0.6466, "mean_token_accuracy": 0.8055108785629272, "num_tokens": 12248556.0, "step": 321 }, { "epoch": 0.04096170970614425, "ewc_loss": 0.0030975341796875, "ewc_loss_diag": 1.0654330253601074e-06, "ewc_loss_parallel": 2.0265579223632812e-06, "grad_norm": 8.035216331481934, "learning_rate": 1.360746078846969e-07, "loss": 0.6356, "mean_token_accuracy": 0.8066294193267822, "num_tokens": 12285664.0, "step": 322 }, { "epoch": 0.04108891998473477, "ewc_loss": 0.003082275390625, "ewc_loss_diag": 1.0579824447631836e-06, "ewc_loss_parallel": 2.0265579223632812e-06, "grad_norm": 9.03209400177002, "learning_rate": 1.3649851632047476e-07, "loss": 0.6891, "mean_token_accuracy": 0.7924723625183105, "num_tokens": 12326964.0, "step": 323 }, { "epoch": 0.041216130263325275, "ewc_loss": 0.003173828125, "ewc_loss_diag": 1.0654330253601074e-06, "ewc_loss_parallel": 2.1010637283325195e-06, "grad_norm": 9.254371643066406, "learning_rate": 1.3692242475625266e-07, "loss": 0.597, "mean_token_accuracy": 0.8213058710098267, "num_tokens": 12366541.0, "step": 324 }, { "epoch": 0.04134334054191579, "ewc_loss": 0.003204345703125, "ewc_loss_diag": 1.0728836059570312e-06, "ewc_loss_parallel": 2.115964889526367e-06, "grad_norm": 9.4032621383667, "learning_rate": 1.373463331920305e-07, "loss": 0.6595, "mean_token_accuracy": 0.8068408966064453, "num_tokens": 12405664.0, "step": 325 }, { "epoch": 0.0414705508205063, "ewc_loss": 0.003173828125, "ewc_loss_diag": 1.0654330253601074e-06, "ewc_loss_parallel": 2.1010637283325195e-06, "grad_norm": 8.89141845703125, "learning_rate": 1.377702416278084e-07, "loss": 0.7242, "mean_token_accuracy": 0.7828853130340576, "num_tokens": 12445039.0, "step": 326 }, { "epoch": 0.041597761099096804, "ewc_loss": 0.003173828125, "ewc_loss_diag": 1.0728836059570312e-06, "ewc_loss_parallel": 2.086162567138672e-06, "grad_norm": 8.304805755615234, "learning_rate": 1.3819415006358625e-07, "loss": 0.5937, "mean_token_accuracy": 0.8194682002067566, "num_tokens": 12485481.0, "step": 327 }, { "epoch": 0.04172497137768732, "ewc_loss": 0.003143310546875, "ewc_loss_diag": 1.0728836059570312e-06, "ewc_loss_parallel": 2.0712614059448242e-06, "grad_norm": 8.343082427978516, "learning_rate": 1.3861805849936415e-07, "loss": 0.6196, "mean_token_accuracy": 0.8152792453765869, "num_tokens": 12530272.0, "step": 328 }, { "epoch": 0.041852181656277826, "ewc_loss": 0.003173828125, "ewc_loss_diag": 1.087784767150879e-06, "ewc_loss_parallel": 2.086162567138672e-06, "grad_norm": 10.097613334655762, "learning_rate": 1.39041966935142e-07, "loss": 0.6668, "mean_token_accuracy": 0.798613965511322, "num_tokens": 12565854.0, "step": 329 }, { "epoch": 0.04197939193486834, "ewc_loss": 0.0032806396484375, "ewc_loss_diag": 1.0952353477478027e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 8.930286407470703, "learning_rate": 1.394658753709199e-07, "loss": 0.6515, "mean_token_accuracy": 0.8035513162612915, "num_tokens": 12607313.0, "step": 330 }, { "epoch": 0.04210660221345885, "ewc_loss": 0.003265380859375, "ewc_loss_diag": 1.1026859283447266e-06, "ewc_loss_parallel": 2.1457672119140625e-06, "grad_norm": 8.506377220153809, "learning_rate": 1.3988978380669774e-07, "loss": 0.6891, "mean_token_accuracy": 0.7923682332038879, "num_tokens": 12642057.0, "step": 331 }, { "epoch": 0.042233812492049355, "ewc_loss": 0.0032196044921875, "ewc_loss_diag": 1.1175870895385742e-06, "ewc_loss_parallel": 2.1010637283325195e-06, "grad_norm": 8.196331977844238, "learning_rate": 1.403136922424756e-07, "loss": 0.6792, "mean_token_accuracy": 0.7950947880744934, "num_tokens": 12684118.0, "step": 332 }, { "epoch": 0.04236102277063987, "ewc_loss": 0.0032196044921875, "ewc_loss_diag": 1.1175870895385742e-06, "ewc_loss_parallel": 2.1010637283325195e-06, "grad_norm": 8.984540939331055, "learning_rate": 1.4073760067825348e-07, "loss": 0.6448, "mean_token_accuracy": 0.8035979270935059, "num_tokens": 12721945.0, "step": 333 }, { "epoch": 0.04248823304923038, "ewc_loss": 0.0032958984375, "ewc_loss_diag": 1.125037670135498e-06, "ewc_loss_parallel": 2.16066837310791e-06, "grad_norm": 8.797715187072754, "learning_rate": 1.4116150911403136e-07, "loss": 0.6579, "mean_token_accuracy": 0.8014206290245056, "num_tokens": 12764175.0, "step": 334 }, { "epoch": 0.04261544332782089, "ewc_loss": 0.0033111572265625, "ewc_loss_diag": 1.1324882507324219e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 8.521462440490723, "learning_rate": 1.4158541754980923e-07, "loss": 0.6093, "mean_token_accuracy": 0.8165260553359985, "num_tokens": 12801600.0, "step": 335 }, { "epoch": 0.0427426536064114, "ewc_loss": 0.0032806396484375, "ewc_loss_diag": 1.1324882507324219e-06, "ewc_loss_parallel": 2.1457672119140625e-06, "grad_norm": 10.009198188781738, "learning_rate": 1.420093259855871e-07, "loss": 0.641, "mean_token_accuracy": 0.8115247488021851, "num_tokens": 12843584.0, "step": 336 }, { "epoch": 0.042869863885001906, "ewc_loss": 0.003326416015625, "ewc_loss_diag": 1.125037670135498e-06, "ewc_loss_parallel": 2.205371856689453e-06, "grad_norm": 8.954092979431152, "learning_rate": 1.4243323442136497e-07, "loss": 0.656, "mean_token_accuracy": 0.8016359210014343, "num_tokens": 12878105.0, "step": 337 }, { "epoch": 0.04299707416359242, "ewc_loss": 0.0033111572265625, "ewc_loss_diag": 1.1324882507324219e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 8.152996063232422, "learning_rate": 1.4285714285714285e-07, "loss": 0.6638, "mean_token_accuracy": 0.8001242876052856, "num_tokens": 12911712.0, "step": 338 }, { "epoch": 0.04312428444218293, "ewc_loss": 0.0032958984375, "ewc_loss_diag": 1.1399388313293457e-06, "ewc_loss_parallel": 2.1457672119140625e-06, "grad_norm": 8.728755950927734, "learning_rate": 1.4328105129292072e-07, "loss": 0.6435, "mean_token_accuracy": 0.806803286075592, "num_tokens": 12951303.0, "step": 339 }, { "epoch": 0.043251494720773435, "ewc_loss": 0.003326416015625, "ewc_loss_diag": 1.1399388313293457e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 8.778334617614746, "learning_rate": 1.437049597286986e-07, "loss": 0.6308, "mean_token_accuracy": 0.8094000816345215, "num_tokens": 12990544.0, "step": 340 }, { "epoch": 0.04337870499936395, "ewc_loss": 0.0033416748046875, "ewc_loss_diag": 1.1622905731201172e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 8.647377967834473, "learning_rate": 1.4412886816447646e-07, "loss": 0.6904, "mean_token_accuracy": 0.7909131646156311, "num_tokens": 13023766.0, "step": 341 }, { "epoch": 0.04350591527795446, "ewc_loss": 0.0033416748046875, "ewc_loss_diag": 1.1622905731201172e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 8.656591415405273, "learning_rate": 1.4455277660025434e-07, "loss": 0.6047, "mean_token_accuracy": 0.8157284259796143, "num_tokens": 13056337.0, "step": 342 }, { "epoch": 0.04363312555654497, "ewc_loss": 0.00335693359375, "ewc_loss_diag": 1.1622905731201172e-06, "ewc_loss_parallel": 2.1904706954956055e-06, "grad_norm": 8.248353004455566, "learning_rate": 1.449766850360322e-07, "loss": 0.6609, "mean_token_accuracy": 0.8018357157707214, "num_tokens": 13097345.0, "step": 343 }, { "epoch": 0.04376033583513548, "ewc_loss": 0.00335693359375, "ewc_loss_diag": 1.169741153717041e-06, "ewc_loss_parallel": 2.175569534301758e-06, "grad_norm": 10.7830228805542, "learning_rate": 1.4540059347181008e-07, "loss": 0.6485, "mean_token_accuracy": 0.8038197755813599, "num_tokens": 13127332.0, "step": 344 }, { "epoch": 0.043887546113725986, "ewc_loss": 0.0034942626953125, "ewc_loss_diag": 1.1920928955078125e-06, "ewc_loss_parallel": 2.3096799850463867e-06, "grad_norm": 8.58588695526123, "learning_rate": 1.4582450190758795e-07, "loss": 0.6762, "mean_token_accuracy": 0.7948005199432373, "num_tokens": 13170760.0, "step": 345 }, { "epoch": 0.0440147563923165, "ewc_loss": 0.003387451171875, "ewc_loss_diag": 1.1995434761047363e-06, "ewc_loss_parallel": 2.1904706954956055e-06, "grad_norm": 8.634429931640625, "learning_rate": 1.4624841034336583e-07, "loss": 0.646, "mean_token_accuracy": 0.806535005569458, "num_tokens": 13207061.0, "step": 346 }, { "epoch": 0.04414196667090701, "ewc_loss": 0.003387451171875, "ewc_loss_diag": 1.1995434761047363e-06, "ewc_loss_parallel": 2.1904706954956055e-06, "grad_norm": 8.447843551635742, "learning_rate": 1.466723187791437e-07, "loss": 0.6127, "mean_token_accuracy": 0.81451815366745, "num_tokens": 13247903.0, "step": 347 }, { "epoch": 0.04426917694949752, "ewc_loss": 0.00341796875, "ewc_loss_diag": 1.1995434761047363e-06, "ewc_loss_parallel": 2.205371856689453e-06, "grad_norm": 8.291492462158203, "learning_rate": 1.4709622721492157e-07, "loss": 0.623, "mean_token_accuracy": 0.8069890737533569, "num_tokens": 13287193.0, "step": 348 }, { "epoch": 0.04439638722808803, "ewc_loss": 0.00341796875, "ewc_loss_diag": 1.1995434761047363e-06, "ewc_loss_parallel": 2.2202730178833008e-06, "grad_norm": 8.801652908325195, "learning_rate": 1.4752013565069942e-07, "loss": 0.6283, "mean_token_accuracy": 0.8092055320739746, "num_tokens": 13323112.0, "step": 349 }, { "epoch": 0.04452359750667854, "ewc_loss": 0.00347900390625, "ewc_loss_diag": 1.214444637298584e-06, "ewc_loss_parallel": 2.2798776626586914e-06, "grad_norm": 8.83452033996582, "learning_rate": 1.4794404408647732e-07, "loss": 0.6239, "mean_token_accuracy": 0.8118606805801392, "num_tokens": 13359653.0, "step": 350 }, { "epoch": 0.04465080778526905, "ewc_loss": 0.0034942626953125, "ewc_loss_diag": 1.2069940567016602e-06, "ewc_loss_parallel": 2.294778823852539e-06, "grad_norm": 8.683308601379395, "learning_rate": 1.4836795252225516e-07, "loss": 0.5954, "mean_token_accuracy": 0.816986083984375, "num_tokens": 13400571.0, "step": 351 }, { "epoch": 0.04477801806385956, "ewc_loss": 0.003509521484375, "ewc_loss_diag": 1.214444637298584e-06, "ewc_loss_parallel": 2.294778823852539e-06, "grad_norm": 9.172021865844727, "learning_rate": 1.4879186095803306e-07, "loss": 0.5729, "mean_token_accuracy": 0.8202654123306274, "num_tokens": 13437656.0, "step": 352 }, { "epoch": 0.04490522834245007, "ewc_loss": 0.0035400390625, "ewc_loss_diag": 1.214444637298584e-06, "ewc_loss_parallel": 2.3245811462402344e-06, "grad_norm": 8.333738327026367, "learning_rate": 1.492157693938109e-07, "loss": 0.6323, "mean_token_accuracy": 0.8076384663581848, "num_tokens": 13474688.0, "step": 353 }, { "epoch": 0.04503243862104058, "ewc_loss": 0.003509521484375, "ewc_loss_diag": 1.2218952178955078e-06, "ewc_loss_parallel": 2.294778823852539e-06, "grad_norm": 8.168127059936523, "learning_rate": 1.496396778295888e-07, "loss": 0.5686, "mean_token_accuracy": 0.8249081373214722, "num_tokens": 13518256.0, "step": 354 }, { "epoch": 0.04515964889963109, "ewc_loss": 0.00347900390625, "ewc_loss_diag": 1.2293457984924316e-06, "ewc_loss_parallel": 2.2649765014648438e-06, "grad_norm": 9.323710441589355, "learning_rate": 1.5006358626536665e-07, "loss": 0.6566, "mean_token_accuracy": 0.80315101146698, "num_tokens": 13554315.0, "step": 355 }, { "epoch": 0.0452868591782216, "ewc_loss": 0.00360107421875, "ewc_loss_diag": 1.2367963790893555e-06, "ewc_loss_parallel": 2.3692846298217773e-06, "grad_norm": 8.694113731384277, "learning_rate": 1.5048749470114455e-07, "loss": 0.6784, "mean_token_accuracy": 0.7960365414619446, "num_tokens": 13594388.0, "step": 356 }, { "epoch": 0.04541406945681211, "ewc_loss": 0.00360107421875, "ewc_loss_diag": 1.2665987014770508e-06, "ewc_loss_parallel": 2.339482307434082e-06, "grad_norm": 9.255871772766113, "learning_rate": 1.509114031369224e-07, "loss": 0.6032, "mean_token_accuracy": 0.8142014741897583, "num_tokens": 13633704.0, "step": 357 }, { "epoch": 0.04554127973540262, "ewc_loss": 0.00360107421875, "ewc_loss_diag": 1.2442469596862793e-06, "ewc_loss_parallel": 2.3543834686279297e-06, "grad_norm": 8.82101821899414, "learning_rate": 1.513353115727003e-07, "loss": 0.6545, "mean_token_accuracy": 0.7972715497016907, "num_tokens": 13671183.0, "step": 358 }, { "epoch": 0.04566849001399313, "ewc_loss": 0.00360107421875, "ewc_loss_diag": 1.259148120880127e-06, "ewc_loss_parallel": 2.339482307434082e-06, "grad_norm": 8.915841102600098, "learning_rate": 1.5175922000847814e-07, "loss": 0.6087, "mean_token_accuracy": 0.8140856027603149, "num_tokens": 13709640.0, "step": 359 }, { "epoch": 0.04579570029258364, "ewc_loss": 0.00360107421875, "ewc_loss_diag": 1.259148120880127e-06, "ewc_loss_parallel": 2.3543834686279297e-06, "grad_norm": 10.185287475585938, "learning_rate": 1.5218312844425604e-07, "loss": 0.6303, "mean_token_accuracy": 0.8015191555023193, "num_tokens": 13751114.0, "step": 360 }, { "epoch": 0.045922910571174154, "ewc_loss": 0.0036773681640625, "ewc_loss_diag": 1.2665987014770508e-06, "ewc_loss_parallel": 2.4139881134033203e-06, "grad_norm": 9.043583869934082, "learning_rate": 1.526070368800339e-07, "loss": 0.5617, "mean_token_accuracy": 0.8228519558906555, "num_tokens": 13786448.0, "step": 361 }, { "epoch": 0.04605012084976466, "ewc_loss": 0.003631591796875, "ewc_loss_diag": 1.259148120880127e-06, "ewc_loss_parallel": 2.3692846298217773e-06, "grad_norm": 8.487922668457031, "learning_rate": 1.530309453158118e-07, "loss": 0.5844, "mean_token_accuracy": 0.8257285356521606, "num_tokens": 13827663.0, "step": 362 }, { "epoch": 0.04617733112835517, "ewc_loss": 0.003570556640625, "ewc_loss_diag": 1.259148120880127e-06, "ewc_loss_parallel": 2.3245811462402344e-06, "grad_norm": 8.477723121643066, "learning_rate": 1.5345485375158963e-07, "loss": 0.6626, "mean_token_accuracy": 0.8001877665519714, "num_tokens": 13866078.0, "step": 363 }, { "epoch": 0.04630454140694568, "ewc_loss": 0.00360107421875, "ewc_loss_diag": 1.2665987014770508e-06, "ewc_loss_parallel": 2.339482307434082e-06, "grad_norm": 8.852173805236816, "learning_rate": 1.5387876218736753e-07, "loss": 0.5406, "mean_token_accuracy": 0.8321781158447266, "num_tokens": 13909767.0, "step": 364 }, { "epoch": 0.04643175168553619, "ewc_loss": 0.003631591796875, "ewc_loss_diag": 1.259148120880127e-06, "ewc_loss_parallel": 2.3692846298217773e-06, "grad_norm": 8.510660171508789, "learning_rate": 1.5430267062314538e-07, "loss": 0.6488, "mean_token_accuracy": 0.8053404092788696, "num_tokens": 13948937.0, "step": 365 }, { "epoch": 0.0465589619641267, "ewc_loss": 0.003631591796875, "ewc_loss_diag": 1.2740492820739746e-06, "ewc_loss_parallel": 2.3692846298217773e-06, "grad_norm": 9.281691551208496, "learning_rate": 1.5472657905892328e-07, "loss": 0.6602, "mean_token_accuracy": 0.7980964183807373, "num_tokens": 13984039.0, "step": 366 }, { "epoch": 0.04668617224271721, "ewc_loss": 0.003692626953125, "ewc_loss_diag": 1.2740492820739746e-06, "ewc_loss_parallel": 2.4139881134033203e-06, "grad_norm": 9.448159217834473, "learning_rate": 1.5515048749470113e-07, "loss": 0.6438, "mean_token_accuracy": 0.8033883571624756, "num_tokens": 14018162.0, "step": 367 }, { "epoch": 0.04681338252130772, "ewc_loss": 0.0036773681640625, "ewc_loss_diag": 1.2814998626708984e-06, "ewc_loss_parallel": 2.3990869522094727e-06, "grad_norm": 8.924078941345215, "learning_rate": 1.55574395930479e-07, "loss": 0.6426, "mean_token_accuracy": 0.8044292330741882, "num_tokens": 14056493.0, "step": 368 }, { "epoch": 0.046940592799898234, "ewc_loss": 0.0036773681640625, "ewc_loss_diag": 1.2814998626708984e-06, "ewc_loss_parallel": 2.3990869522094727e-06, "grad_norm": 9.14168643951416, "learning_rate": 1.5599830436625687e-07, "loss": 0.6371, "mean_token_accuracy": 0.8075697422027588, "num_tokens": 14097530.0, "step": 369 }, { "epoch": 0.04706780307848874, "ewc_loss": 0.003692626953125, "ewc_loss_diag": 1.2889504432678223e-06, "ewc_loss_parallel": 2.3990869522094727e-06, "grad_norm": 8.985250473022461, "learning_rate": 1.5642221280203474e-07, "loss": 0.6549, "mean_token_accuracy": 0.8056047558784485, "num_tokens": 14136240.0, "step": 370 }, { "epoch": 0.04719501335707925, "ewc_loss": 0.003692626953125, "ewc_loss_diag": 1.296401023864746e-06, "ewc_loss_parallel": 2.3990869522094727e-06, "grad_norm": 9.091946601867676, "learning_rate": 1.5684612123781262e-07, "loss": 0.5651, "mean_token_accuracy": 0.8270126581192017, "num_tokens": 14171010.0, "step": 371 }, { "epoch": 0.04732222363566976, "ewc_loss": 0.0037078857421875, "ewc_loss_diag": 1.3113021850585938e-06, "ewc_loss_parallel": 2.3990869522094727e-06, "grad_norm": 8.266680717468262, "learning_rate": 1.572700296735905e-07, "loss": 0.6353, "mean_token_accuracy": 0.8123961091041565, "num_tokens": 14206985.0, "step": 372 }, { "epoch": 0.04744943391426027, "ewc_loss": 0.00372314453125, "ewc_loss_diag": 1.3336539268493652e-06, "ewc_loss_parallel": 2.3990869522094727e-06, "grad_norm": 9.31215763092041, "learning_rate": 1.576939381093684e-07, "loss": 0.6351, "mean_token_accuracy": 0.8039594292640686, "num_tokens": 14236728.0, "step": 373 }, { "epoch": 0.047576644192850785, "ewc_loss": 0.0037841796875, "ewc_loss_diag": 1.3336539268493652e-06, "ewc_loss_parallel": 2.4586915969848633e-06, "grad_norm": 9.62464427947998, "learning_rate": 1.5811784654514623e-07, "loss": 0.5614, "mean_token_accuracy": 0.8279512524604797, "num_tokens": 14270758.0, "step": 374 }, { "epoch": 0.04770385447144129, "ewc_loss": 0.0037841796875, "ewc_loss_diag": 1.3336539268493652e-06, "ewc_loss_parallel": 2.4586915969848633e-06, "grad_norm": 8.698525428771973, "learning_rate": 1.5854175498092413e-07, "loss": 0.6792, "mean_token_accuracy": 0.7911028861999512, "num_tokens": 14315002.0, "step": 375 }, { "epoch": 0.0478310647500318, "ewc_loss": 0.003753662109375, "ewc_loss_diag": 1.3485550880432129e-06, "ewc_loss_parallel": 2.4139881134033203e-06, "grad_norm": 8.736621856689453, "learning_rate": 1.5896566341670198e-07, "loss": 0.6496, "mean_token_accuracy": 0.7994215488433838, "num_tokens": 14353385.0, "step": 376 }, { "epoch": 0.047958275028622314, "ewc_loss": 0.0037994384765625, "ewc_loss_diag": 1.341104507446289e-06, "ewc_loss_parallel": 2.4586915969848633e-06, "grad_norm": 10.998238563537598, "learning_rate": 1.5938957185247988e-07, "loss": 0.6825, "mean_token_accuracy": 0.7892805337905884, "num_tokens": 14391702.0, "step": 377 }, { "epoch": 0.04808548530721282, "ewc_loss": 0.00390625, "ewc_loss_diag": 1.3485550880432129e-06, "ewc_loss_parallel": 2.562999725341797e-06, "grad_norm": 9.651021957397461, "learning_rate": 1.5981348028825772e-07, "loss": 0.6508, "mean_token_accuracy": 0.8021162152290344, "num_tokens": 14427876.0, "step": 378 }, { "epoch": 0.048212695585803336, "ewc_loss": 0.00384521484375, "ewc_loss_diag": 1.3560056686401367e-06, "ewc_loss_parallel": 2.4884939193725586e-06, "grad_norm": 9.649397850036621, "learning_rate": 1.6023738872403562e-07, "loss": 0.7385, "mean_token_accuracy": 0.7746663689613342, "num_tokens": 14460349.0, "step": 379 }, { "epoch": 0.048339905864393844, "ewc_loss": 0.003814697265625, "ewc_loss_diag": 1.3560056686401367e-06, "ewc_loss_parallel": 2.4586915969848633e-06, "grad_norm": 8.979399681091309, "learning_rate": 1.6066129715981347e-07, "loss": 0.5868, "mean_token_accuracy": 0.8228420615196228, "num_tokens": 14497763.0, "step": 380 }, { "epoch": 0.04846711614298435, "ewc_loss": 0.003814697265625, "ewc_loss_diag": 1.3634562492370605e-06, "ewc_loss_parallel": 2.4586915969848633e-06, "grad_norm": 9.013592720031738, "learning_rate": 1.6108520559559137e-07, "loss": 0.5797, "mean_token_accuracy": 0.820466160774231, "num_tokens": 14537172.0, "step": 381 }, { "epoch": 0.048594326421574865, "ewc_loss": 0.003814697265625, "ewc_loss_diag": 1.3634562492370605e-06, "ewc_loss_parallel": 2.4586915969848633e-06, "grad_norm": 10.02580738067627, "learning_rate": 1.6150911403136921e-07, "loss": 0.6682, "mean_token_accuracy": 0.8023302555084229, "num_tokens": 14565515.0, "step": 382 }, { "epoch": 0.04872153670016537, "ewc_loss": 0.00390625, "ewc_loss_diag": 1.3783574104309082e-06, "ewc_loss_parallel": 2.518296241760254e-06, "grad_norm": 8.490594863891602, "learning_rate": 1.619330224671471e-07, "loss": 0.6579, "mean_token_accuracy": 0.8015102744102478, "num_tokens": 14608104.0, "step": 383 }, { "epoch": 0.04884874697875588, "ewc_loss": 0.003814697265625, "ewc_loss_diag": 1.3783574104309082e-06, "ewc_loss_parallel": 2.4437904357910156e-06, "grad_norm": 9.263189315795898, "learning_rate": 1.6235693090292496e-07, "loss": 0.6648, "mean_token_accuracy": 0.8019616603851318, "num_tokens": 14645328.0, "step": 384 }, { "epoch": 0.048975957257346395, "ewc_loss": 0.0038909912109375, "ewc_loss_diag": 1.385807991027832e-06, "ewc_loss_parallel": 2.5033950805664062e-06, "grad_norm": 9.499841690063477, "learning_rate": 1.6278083933870286e-07, "loss": 0.7023, "mean_token_accuracy": 0.7832050323486328, "num_tokens": 14678792.0, "step": 385 }, { "epoch": 0.0491031675359369, "ewc_loss": 0.003936767578125, "ewc_loss_diag": 1.3932585716247559e-06, "ewc_loss_parallel": 2.5331974029541016e-06, "grad_norm": 8.880697250366211, "learning_rate": 1.632047477744807e-07, "loss": 0.6235, "mean_token_accuracy": 0.8110737800598145, "num_tokens": 14715095.0, "step": 386 }, { "epoch": 0.049230377814527417, "ewc_loss": 0.00390625, "ewc_loss_diag": 1.4007091522216797e-06, "ewc_loss_parallel": 2.5033950805664062e-06, "grad_norm": 8.982095718383789, "learning_rate": 1.6362865621025858e-07, "loss": 0.6428, "mean_token_accuracy": 0.8054865598678589, "num_tokens": 14753641.0, "step": 387 }, { "epoch": 0.049357588093117924, "ewc_loss": 0.003936767578125, "ewc_loss_diag": 1.4081597328186035e-06, "ewc_loss_parallel": 2.518296241760254e-06, "grad_norm": 9.064177513122559, "learning_rate": 1.6405256464603645e-07, "loss": 0.6143, "mean_token_accuracy": 0.8106854557991028, "num_tokens": 14788816.0, "step": 388 }, { "epoch": 0.04948479837170843, "ewc_loss": 0.003936767578125, "ewc_loss_diag": 1.4081597328186035e-06, "ewc_loss_parallel": 2.5331974029541016e-06, "grad_norm": 9.844646453857422, "learning_rate": 1.6447647308181432e-07, "loss": 0.6589, "mean_token_accuracy": 0.7986175417900085, "num_tokens": 14821607.0, "step": 389 }, { "epoch": 0.049612008650298946, "ewc_loss": 0.003997802734375, "ewc_loss_diag": 1.4230608940124512e-06, "ewc_loss_parallel": 2.562999725341797e-06, "grad_norm": 8.94582462310791, "learning_rate": 1.649003815175922e-07, "loss": 0.6752, "mean_token_accuracy": 0.7971038818359375, "num_tokens": 14861298.0, "step": 390 }, { "epoch": 0.04973921892888945, "ewc_loss": 0.003936767578125, "ewc_loss_diag": 1.4156103134155273e-06, "ewc_loss_parallel": 2.518296241760254e-06, "grad_norm": 8.89790153503418, "learning_rate": 1.6532428995337007e-07, "loss": 0.5914, "mean_token_accuracy": 0.8188658952713013, "num_tokens": 14899951.0, "step": 391 }, { "epoch": 0.04986642920747997, "ewc_loss": 0.00396728515625, "ewc_loss_diag": 1.4230608940124512e-06, "ewc_loss_parallel": 2.5331974029541016e-06, "grad_norm": 8.364798545837402, "learning_rate": 1.6574819838914794e-07, "loss": 0.5711, "mean_token_accuracy": 0.8267093896865845, "num_tokens": 14937169.0, "step": 392 }, { "epoch": 0.049993639486070475, "ewc_loss": 0.00396728515625, "ewc_loss_diag": 1.430511474609375e-06, "ewc_loss_parallel": 2.5331974029541016e-06, "grad_norm": 8.738944053649902, "learning_rate": 1.661721068249258e-07, "loss": 0.6258, "mean_token_accuracy": 0.8093734383583069, "num_tokens": 14975549.0, "step": 393 }, { "epoch": 0.05012084976466098, "ewc_loss": 0.00396728515625, "ewc_loss_diag": 1.4379620552062988e-06, "ewc_loss_parallel": 2.5480985641479492e-06, "grad_norm": 8.955483436584473, "learning_rate": 1.6659601526070368e-07, "loss": 0.6524, "mean_token_accuracy": 0.8034367561340332, "num_tokens": 15016630.0, "step": 394 }, { "epoch": 0.0502480600432515, "ewc_loss": 0.0040283203125, "ewc_loss_diag": 1.4379620552062988e-06, "ewc_loss_parallel": 2.5779008865356445e-06, "grad_norm": 8.701189041137695, "learning_rate": 1.6701992369648156e-07, "loss": 0.6334, "mean_token_accuracy": 0.8060410618782043, "num_tokens": 15057822.0, "step": 395 }, { "epoch": 0.050375270321842004, "ewc_loss": 0.0040283203125, "ewc_loss_diag": 1.4528632164001465e-06, "ewc_loss_parallel": 2.562999725341797e-06, "grad_norm": 9.952896118164062, "learning_rate": 1.6744383213225943e-07, "loss": 0.6072, "mean_token_accuracy": 0.8091784715652466, "num_tokens": 15094385.0, "step": 396 }, { "epoch": 0.05050248060043251, "ewc_loss": 0.004058837890625, "ewc_loss_diag": 1.4454126358032227e-06, "ewc_loss_parallel": 2.6226043701171875e-06, "grad_norm": 8.857146263122559, "learning_rate": 1.678677405680373e-07, "loss": 0.5666, "mean_token_accuracy": 0.8221498727798462, "num_tokens": 15130969.0, "step": 397 }, { "epoch": 0.050629690879023026, "ewc_loss": 0.0040283203125, "ewc_loss_diag": 1.4528632164001465e-06, "ewc_loss_parallel": 2.562999725341797e-06, "grad_norm": 9.73171329498291, "learning_rate": 1.6829164900381518e-07, "loss": 0.5877, "mean_token_accuracy": 0.8157630562782288, "num_tokens": 15166586.0, "step": 398 }, { "epoch": 0.05075690115761353, "ewc_loss": 0.0040283203125, "ewc_loss_diag": 1.4677643775939941e-06, "ewc_loss_parallel": 2.5779008865356445e-06, "grad_norm": 9.500273704528809, "learning_rate": 1.6871555743959305e-07, "loss": 0.5778, "mean_token_accuracy": 0.8192496299743652, "num_tokens": 15209603.0, "step": 399 }, { "epoch": 0.05088411143620405, "ewc_loss": 0.004058837890625, "ewc_loss_diag": 1.4677643775939941e-06, "ewc_loss_parallel": 2.592802047729492e-06, "grad_norm": 9.825133323669434, "learning_rate": 1.6913946587537092e-07, "loss": 0.5954, "mean_token_accuracy": 0.8154278993606567, "num_tokens": 15251542.0, "step": 400 }, { "epoch": 0.051011321714794555, "ewc_loss": 0.00408935546875, "ewc_loss_diag": 1.4677643775939941e-06, "ewc_loss_parallel": 2.60770320892334e-06, "grad_norm": 9.419018745422363, "learning_rate": 1.695633743111488e-07, "loss": 0.6103, "mean_token_accuracy": 0.8151687383651733, "num_tokens": 15288940.0, "step": 401 }, { "epoch": 0.05113853199338506, "ewc_loss": 0.004058837890625, "ewc_loss_diag": 1.475214958190918e-06, "ewc_loss_parallel": 2.592802047729492e-06, "grad_norm": 10.228755950927734, "learning_rate": 1.6998728274692667e-07, "loss": 0.5776, "mean_token_accuracy": 0.8251042366027832, "num_tokens": 15321965.0, "step": 402 }, { "epoch": 0.05126574227197558, "ewc_loss": 0.004119873046875, "ewc_loss_diag": 1.4826655387878418e-06, "ewc_loss_parallel": 2.637505531311035e-06, "grad_norm": 9.599833488464355, "learning_rate": 1.7041119118270454e-07, "loss": 0.5843, "mean_token_accuracy": 0.8218507170677185, "num_tokens": 15362895.0, "step": 403 }, { "epoch": 0.051392952550566084, "ewc_loss": 0.004058837890625, "ewc_loss_diag": 1.4901161193847656e-06, "ewc_loss_parallel": 2.5779008865356445e-06, "grad_norm": 10.243278503417969, "learning_rate": 1.7083509961848238e-07, "loss": 0.6464, "mean_token_accuracy": 0.8021963834762573, "num_tokens": 15396675.0, "step": 404 }, { "epoch": 0.0515201628291566, "ewc_loss": 0.004119873046875, "ewc_loss_diag": 1.4901161193847656e-06, "ewc_loss_parallel": 2.637505531311035e-06, "grad_norm": 8.8756103515625, "learning_rate": 1.7125900805426028e-07, "loss": 0.6656, "mean_token_accuracy": 0.7974413633346558, "num_tokens": 15443070.0, "step": 405 }, { "epoch": 0.051647373107747106, "ewc_loss": 0.00408935546875, "ewc_loss_diag": 1.4975666999816895e-06, "ewc_loss_parallel": 2.5779008865356445e-06, "grad_norm": 9.190844535827637, "learning_rate": 1.7168291649003813e-07, "loss": 0.5934, "mean_token_accuracy": 0.8160126209259033, "num_tokens": 15476747.0, "step": 406 }, { "epoch": 0.051774583386337614, "ewc_loss": 0.00408935546875, "ewc_loss_diag": 1.4975666999816895e-06, "ewc_loss_parallel": 2.592802047729492e-06, "grad_norm": 9.253902435302734, "learning_rate": 1.7210682492581603e-07, "loss": 0.6178, "mean_token_accuracy": 0.8119809031486511, "num_tokens": 15514866.0, "step": 407 }, { "epoch": 0.05190179366492813, "ewc_loss": 0.004150390625, "ewc_loss_diag": 1.4975666999816895e-06, "ewc_loss_parallel": 2.652406692504883e-06, "grad_norm": 9.565776824951172, "learning_rate": 1.7253073336159387e-07, "loss": 0.5938, "mean_token_accuracy": 0.8162865042686462, "num_tokens": 15554245.0, "step": 408 }, { "epoch": 0.052029003943518635, "ewc_loss": 0.004150390625, "ewc_loss_diag": 1.4901161193847656e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 9.721236228942871, "learning_rate": 1.7295464179737177e-07, "loss": 0.6222, "mean_token_accuracy": 0.8115763664245605, "num_tokens": 15593958.0, "step": 409 }, { "epoch": 0.05215621422210915, "ewc_loss": 0.004150390625, "ewc_loss_diag": 1.5050172805786133e-06, "ewc_loss_parallel": 2.652406692504883e-06, "grad_norm": 8.986916542053223, "learning_rate": 1.7337855023314962e-07, "loss": 0.5677, "mean_token_accuracy": 0.8249009251594543, "num_tokens": 15632567.0, "step": 410 }, { "epoch": 0.05228342450069966, "ewc_loss": 0.00408935546875, "ewc_loss_diag": 1.5124678611755371e-06, "ewc_loss_parallel": 2.562999725341797e-06, "grad_norm": 8.658182144165039, "learning_rate": 1.7380245866892752e-07, "loss": 0.5788, "mean_token_accuracy": 0.8205491304397583, "num_tokens": 15668255.0, "step": 411 }, { "epoch": 0.052410634779290165, "ewc_loss": 0.004119873046875, "ewc_loss_diag": 1.5050172805786133e-06, "ewc_loss_parallel": 2.60770320892334e-06, "grad_norm": 9.060068130493164, "learning_rate": 1.7422636710470536e-07, "loss": 0.6314, "mean_token_accuracy": 0.8052312135696411, "num_tokens": 15705435.0, "step": 412 }, { "epoch": 0.05253784505788068, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.5273690223693848e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 9.134624481201172, "learning_rate": 1.7465027554048326e-07, "loss": 0.6521, "mean_token_accuracy": 0.8008342385292053, "num_tokens": 15744657.0, "step": 413 }, { "epoch": 0.05266505533647119, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.5273690223693848e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 9.222827911376953, "learning_rate": 1.750741839762611e-07, "loss": 0.5931, "mean_token_accuracy": 0.8182229399681091, "num_tokens": 15777374.0, "step": 414 }, { "epoch": 0.052792265615061694, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.5273690223693848e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 9.194140434265137, "learning_rate": 1.75498092412039e-07, "loss": 0.5662, "mean_token_accuracy": 0.8260390758514404, "num_tokens": 15814047.0, "step": 415 }, { "epoch": 0.05291947589365221, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.5497207641601562e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 8.94787311553955, "learning_rate": 1.7592200084781686e-07, "loss": 0.6495, "mean_token_accuracy": 0.8000749349594116, "num_tokens": 15849110.0, "step": 416 }, { "epoch": 0.053046686172242716, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.5497207641601562e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 8.863934516906738, "learning_rate": 1.7634590928359475e-07, "loss": 0.618, "mean_token_accuracy": 0.8134540319442749, "num_tokens": 15888910.0, "step": 417 }, { "epoch": 0.05317389645083323, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.5497207641601562e-06, "ewc_loss_parallel": 2.652406692504883e-06, "grad_norm": 8.829362869262695, "learning_rate": 1.767698177193726e-07, "loss": 0.6048, "mean_token_accuracy": 0.8144852519035339, "num_tokens": 15925604.0, "step": 418 }, { "epoch": 0.05330110672942374, "ewc_loss": 0.00421142578125, "ewc_loss_diag": 1.55717134475708e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 9.048211097717285, "learning_rate": 1.771937261551505e-07, "loss": 0.5783, "mean_token_accuracy": 0.8270939588546753, "num_tokens": 15961401.0, "step": 419 }, { "epoch": 0.053428317008014245, "ewc_loss": 0.0042724609375, "ewc_loss_diag": 1.564621925354004e-06, "ewc_loss_parallel": 2.7120113372802734e-06, "grad_norm": 9.46025562286377, "learning_rate": 1.7761763459092835e-07, "loss": 0.6091, "mean_token_accuracy": 0.8129516243934631, "num_tokens": 16001029.0, "step": 420 }, { "epoch": 0.05355552728660476, "ewc_loss": 0.0042724609375, "ewc_loss_diag": 1.5720725059509277e-06, "ewc_loss_parallel": 2.7120113372802734e-06, "grad_norm": 8.957682609558105, "learning_rate": 1.7804154302670624e-07, "loss": 0.5903, "mean_token_accuracy": 0.8197292685508728, "num_tokens": 16036014.0, "step": 421 }, { "epoch": 0.05368273756519527, "ewc_loss": 0.0042724609375, "ewc_loss_diag": 1.5795230865478516e-06, "ewc_loss_parallel": 2.682209014892578e-06, "grad_norm": 8.836299896240234, "learning_rate": 1.784654514624841e-07, "loss": 0.5724, "mean_token_accuracy": 0.8213561773300171, "num_tokens": 16075373.0, "step": 422 }, { "epoch": 0.05380994784378578, "ewc_loss": 0.0042724609375, "ewc_loss_diag": 1.5869736671447754e-06, "ewc_loss_parallel": 2.6673078536987305e-06, "grad_norm": 9.038681030273438, "learning_rate": 1.7888935989826196e-07, "loss": 0.6141, "mean_token_accuracy": 0.8080251216888428, "num_tokens": 16111352.0, "step": 423 }, { "epoch": 0.05393715812237629, "ewc_loss": 0.004302978515625, "ewc_loss_diag": 1.5795230865478516e-06, "ewc_loss_parallel": 2.7120113372802734e-06, "grad_norm": 9.463739395141602, "learning_rate": 1.7931326833403984e-07, "loss": 0.631, "mean_token_accuracy": 0.8050740957260132, "num_tokens": 16148826.0, "step": 424 }, { "epoch": 0.054064368400966796, "ewc_loss": 0.00439453125, "ewc_loss_diag": 1.5944242477416992e-06, "ewc_loss_parallel": 2.8014183044433594e-06, "grad_norm": 9.020630836486816, "learning_rate": 1.797371767698177e-07, "loss": 0.5353, "mean_token_accuracy": 0.8356612920761108, "num_tokens": 16193491.0, "step": 425 }, { "epoch": 0.05419157867955731, "ewc_loss": 0.004302978515625, "ewc_loss_diag": 1.5869736671447754e-06, "ewc_loss_parallel": 2.7120113372802734e-06, "grad_norm": 9.756930351257324, "learning_rate": 1.8016108520559558e-07, "loss": 0.5885, "mean_token_accuracy": 0.8197755217552185, "num_tokens": 16235595.0, "step": 426 }, { "epoch": 0.05431878895814782, "ewc_loss": 0.004425048828125, "ewc_loss_diag": 1.5869736671447754e-06, "ewc_loss_parallel": 2.8312206268310547e-06, "grad_norm": 9.940361976623535, "learning_rate": 1.8058499364137345e-07, "loss": 0.6432, "mean_token_accuracy": 0.808164119720459, "num_tokens": 16274105.0, "step": 427 }, { "epoch": 0.054445999236738325, "ewc_loss": 0.004425048828125, "ewc_loss_diag": 1.5944242477416992e-06, "ewc_loss_parallel": 2.816319465637207e-06, "grad_norm": 9.287126541137695, "learning_rate": 1.8100890207715133e-07, "loss": 0.6857, "mean_token_accuracy": 0.7863578200340271, "num_tokens": 16315886.0, "step": 428 }, { "epoch": 0.05457320951532884, "ewc_loss": 0.00433349609375, "ewc_loss_diag": 1.601874828338623e-06, "ewc_loss_parallel": 2.7120113372802734e-06, "grad_norm": 8.878997802734375, "learning_rate": 1.814328105129292e-07, "loss": 0.6268, "mean_token_accuracy": 0.8061936497688293, "num_tokens": 16353021.0, "step": 429 }, { "epoch": 0.05470041979391935, "ewc_loss": 0.00433349609375, "ewc_loss_diag": 1.6167759895324707e-06, "ewc_loss_parallel": 2.7120113372802734e-06, "grad_norm": 9.755663871765137, "learning_rate": 1.8185671894870707e-07, "loss": 0.6014, "mean_token_accuracy": 0.8117257356643677, "num_tokens": 16380887.0, "step": 430 }, { "epoch": 0.05482763007250986, "ewc_loss": 0.00445556640625, "ewc_loss_diag": 1.6167759895324707e-06, "ewc_loss_parallel": 2.8461217880249023e-06, "grad_norm": 9.000489234924316, "learning_rate": 1.8228062738448494e-07, "loss": 0.6582, "mean_token_accuracy": 0.7996976375579834, "num_tokens": 16421066.0, "step": 431 }, { "epoch": 0.05495484035110037, "ewc_loss": 0.004425048828125, "ewc_loss_diag": 1.6242265701293945e-06, "ewc_loss_parallel": 2.8014183044433594e-06, "grad_norm": 9.42226791381836, "learning_rate": 1.8270453582026282e-07, "loss": 0.6277, "mean_token_accuracy": 0.8067263960838318, "num_tokens": 16457197.0, "step": 432 }, { "epoch": 0.055082050629690876, "ewc_loss": 0.0045166015625, "ewc_loss_diag": 1.6316771507263184e-06, "ewc_loss_parallel": 2.8759241104125977e-06, "grad_norm": 9.11780071258545, "learning_rate": 1.831284442560407e-07, "loss": 0.58, "mean_token_accuracy": 0.8199741840362549, "num_tokens": 16497385.0, "step": 433 }, { "epoch": 0.05520926090828139, "ewc_loss": 0.00445556640625, "ewc_loss_diag": 1.6316771507263184e-06, "ewc_loss_parallel": 2.8312206268310547e-06, "grad_norm": 9.457630157470703, "learning_rate": 1.8355235269181856e-07, "loss": 0.5956, "mean_token_accuracy": 0.8173998594284058, "num_tokens": 16540099.0, "step": 434 }, { "epoch": 0.0553364711868719, "ewc_loss": 0.004486083984375, "ewc_loss_diag": 1.6242265701293945e-06, "ewc_loss_parallel": 2.8759241104125977e-06, "grad_norm": 9.864446640014648, "learning_rate": 1.8397626112759643e-07, "loss": 0.5945, "mean_token_accuracy": 0.8173420429229736, "num_tokens": 16572760.0, "step": 435 }, { "epoch": 0.05546368146546241, "ewc_loss": 0.004608154296875, "ewc_loss_diag": 1.6540288925170898e-06, "ewc_loss_parallel": 2.9653310775756836e-06, "grad_norm": 9.452345848083496, "learning_rate": 1.844001695633743e-07, "loss": 0.5527, "mean_token_accuracy": 0.8289180994033813, "num_tokens": 16608549.0, "step": 436 }, { "epoch": 0.05559089174405292, "ewc_loss": 0.0045166015625, "ewc_loss_diag": 1.6540288925170898e-06, "ewc_loss_parallel": 2.8759241104125977e-06, "grad_norm": 9.700374603271484, "learning_rate": 1.8482407799915218e-07, "loss": 0.592, "mean_token_accuracy": 0.8189395666122437, "num_tokens": 16643800.0, "step": 437 }, { "epoch": 0.05571810202264343, "ewc_loss": 0.0045166015625, "ewc_loss_diag": 1.646578311920166e-06, "ewc_loss_parallel": 2.86102294921875e-06, "grad_norm": 10.0416259765625, "learning_rate": 1.8524798643493005e-07, "loss": 0.6643, "mean_token_accuracy": 0.794318437576294, "num_tokens": 16677948.0, "step": 438 }, { "epoch": 0.05584531230123394, "ewc_loss": 0.004638671875, "ewc_loss_diag": 1.646578311920166e-06, "ewc_loss_parallel": 2.9802322387695312e-06, "grad_norm": 9.351419448852539, "learning_rate": 1.8567189487070792e-07, "loss": 0.6353, "mean_token_accuracy": 0.806441068649292, "num_tokens": 16716905.0, "step": 439 }, { "epoch": 0.05597252257982445, "ewc_loss": 0.0045166015625, "ewc_loss_diag": 1.646578311920166e-06, "ewc_loss_parallel": 2.8908252716064453e-06, "grad_norm": 9.240488052368164, "learning_rate": 1.8609580330648577e-07, "loss": 0.5469, "mean_token_accuracy": 0.8307684659957886, "num_tokens": 16756158.0, "step": 440 }, { "epoch": 0.05609973285841496, "ewc_loss": 0.00457763671875, "ewc_loss_diag": 1.6763806343078613e-06, "ewc_loss_parallel": 2.8908252716064453e-06, "grad_norm": 9.694091796875, "learning_rate": 1.8651971174226367e-07, "loss": 0.6387, "mean_token_accuracy": 0.8068265318870544, "num_tokens": 16794261.0, "step": 441 }, { "epoch": 0.05622694313700547, "ewc_loss": 0.004669189453125, "ewc_loss_diag": 1.6838312149047852e-06, "ewc_loss_parallel": 2.995133399963379e-06, "grad_norm": 8.992998123168945, "learning_rate": 1.8694362017804152e-07, "loss": 0.5954, "mean_token_accuracy": 0.8153626918792725, "num_tokens": 16834953.0, "step": 442 }, { "epoch": 0.05635415341559598, "ewc_loss": 0.004547119140625, "ewc_loss_diag": 1.6838312149047852e-06, "ewc_loss_parallel": 2.8759241104125977e-06, "grad_norm": 9.051018714904785, "learning_rate": 1.8736752861381941e-07, "loss": 0.5538, "mean_token_accuracy": 0.8301376104354858, "num_tokens": 16873954.0, "step": 443 }, { "epoch": 0.05648136369418649, "ewc_loss": 0.004638671875, "ewc_loss_diag": 1.6838312149047852e-06, "ewc_loss_parallel": 2.9653310775756836e-06, "grad_norm": 9.807522773742676, "learning_rate": 1.8779143704959726e-07, "loss": 0.5787, "mean_token_accuracy": 0.8183627128601074, "num_tokens": 16906018.0, "step": 444 }, { "epoch": 0.056608573972777, "ewc_loss": 0.004730224609375, "ewc_loss_diag": 1.6987323760986328e-06, "ewc_loss_parallel": 3.0249357223510742e-06, "grad_norm": 9.54122543334961, "learning_rate": 1.8821534548537516e-07, "loss": 0.633, "mean_token_accuracy": 0.8062446117401123, "num_tokens": 16938945.0, "step": 445 }, { "epoch": 0.05673578425136751, "ewc_loss": 0.00469970703125, "ewc_loss_diag": 1.7061829566955566e-06, "ewc_loss_parallel": 3.0100345611572266e-06, "grad_norm": 9.14517879486084, "learning_rate": 1.88639253921153e-07, "loss": 0.5889, "mean_token_accuracy": 0.8208847641944885, "num_tokens": 16982214.0, "step": 446 }, { "epoch": 0.05686299452995802, "ewc_loss": 0.004638671875, "ewc_loss_diag": 1.7061829566955566e-06, "ewc_loss_parallel": 2.9355287551879883e-06, "grad_norm": 9.397273063659668, "learning_rate": 1.890631623569309e-07, "loss": 0.5971, "mean_token_accuracy": 0.8155835270881653, "num_tokens": 17024367.0, "step": 447 }, { "epoch": 0.05699020480854853, "ewc_loss": 0.004730224609375, "ewc_loss_diag": 1.6987323760986328e-06, "ewc_loss_parallel": 3.0249357223510742e-06, "grad_norm": 9.337089538574219, "learning_rate": 1.8948707079270875e-07, "loss": 0.6207, "mean_token_accuracy": 0.8057700395584106, "num_tokens": 17057451.0, "step": 448 }, { "epoch": 0.057117415087139044, "ewc_loss": 0.00469970703125, "ewc_loss_diag": 1.7061829566955566e-06, "ewc_loss_parallel": 3.0100345611572266e-06, "grad_norm": 10.830193519592285, "learning_rate": 1.8991097922848665e-07, "loss": 0.5728, "mean_token_accuracy": 0.8229221701622009, "num_tokens": 17094633.0, "step": 449 }, { "epoch": 0.05724462536572955, "ewc_loss": 0.00482177734375, "ewc_loss_diag": 1.7061829566955566e-06, "ewc_loss_parallel": 3.11434268951416e-06, "grad_norm": 10.04205322265625, "learning_rate": 1.903348876642645e-07, "loss": 0.6261, "mean_token_accuracy": 0.8067708015441895, "num_tokens": 17135892.0, "step": 450 }, { "epoch": 0.05737183564432006, "ewc_loss": 0.004791259765625, "ewc_loss_diag": 1.7508864402770996e-06, "ewc_loss_parallel": 3.0547380447387695e-06, "grad_norm": 9.135455131530762, "learning_rate": 1.907587961000424e-07, "loss": 0.5674, "mean_token_accuracy": 0.8241199254989624, "num_tokens": 17173881.0, "step": 451 }, { "epoch": 0.05749904592291057, "ewc_loss": 0.004669189453125, "ewc_loss_diag": 1.7285346984863281e-06, "ewc_loss_parallel": 2.9355287551879883e-06, "grad_norm": 9.093423843383789, "learning_rate": 1.9118270453582024e-07, "loss": 0.6179, "mean_token_accuracy": 0.8107091188430786, "num_tokens": 17215668.0, "step": 452 }, { "epoch": 0.05762625620150108, "ewc_loss": 0.004730224609375, "ewc_loss_diag": 1.7285346984863281e-06, "ewc_loss_parallel": 3.0100345611572266e-06, "grad_norm": 9.338827133178711, "learning_rate": 1.9160661297159814e-07, "loss": 0.6432, "mean_token_accuracy": 0.8051385879516602, "num_tokens": 17254339.0, "step": 453 }, { "epoch": 0.057753466480091595, "ewc_loss": 0.004791259765625, "ewc_loss_diag": 1.7285346984863281e-06, "ewc_loss_parallel": 3.069639205932617e-06, "grad_norm": 9.3577299118042, "learning_rate": 1.9203052140737599e-07, "loss": 0.6256, "mean_token_accuracy": 0.805233359336853, "num_tokens": 17288858.0, "step": 454 }, { "epoch": 0.0578806767586821, "ewc_loss": 0.00482177734375, "ewc_loss_diag": 1.735985279083252e-06, "ewc_loss_parallel": 3.069639205932617e-06, "grad_norm": 9.117555618286133, "learning_rate": 1.9245442984315389e-07, "loss": 0.6026, "mean_token_accuracy": 0.8165992498397827, "num_tokens": 17330601.0, "step": 455 }, { "epoch": 0.05800788703727261, "ewc_loss": 0.0047607421875, "ewc_loss_diag": 1.735985279083252e-06, "ewc_loss_parallel": 3.039836883544922e-06, "grad_norm": 9.54269790649414, "learning_rate": 1.9287833827893173e-07, "loss": 0.6215, "mean_token_accuracy": 0.8075653314590454, "num_tokens": 17367797.0, "step": 456 }, { "epoch": 0.058135097315863124, "ewc_loss": 0.00482177734375, "ewc_loss_diag": 1.7657876014709473e-06, "ewc_loss_parallel": 3.069639205932617e-06, "grad_norm": 9.350153923034668, "learning_rate": 1.9330224671470963e-07, "loss": 0.6086, "mean_token_accuracy": 0.8127827644348145, "num_tokens": 17403206.0, "step": 457 }, { "epoch": 0.05826230759445363, "ewc_loss": 0.00482177734375, "ewc_loss_diag": 1.7583370208740234e-06, "ewc_loss_parallel": 3.084540367126465e-06, "grad_norm": 9.266191482543945, "learning_rate": 1.9372615515048748e-07, "loss": 0.6542, "mean_token_accuracy": 0.8035718202590942, "num_tokens": 17437207.0, "step": 458 }, { "epoch": 0.05838951787304414, "ewc_loss": 0.00482177734375, "ewc_loss_diag": 1.7583370208740234e-06, "ewc_loss_parallel": 3.084540367126465e-06, "grad_norm": 9.285449981689453, "learning_rate": 1.9415006358626535e-07, "loss": 0.5903, "mean_token_accuracy": 0.8214917182922363, "num_tokens": 17473335.0, "step": 459 }, { "epoch": 0.05851672815163465, "ewc_loss": 0.0048828125, "ewc_loss_diag": 1.7657876014709473e-06, "ewc_loss_parallel": 3.129243850708008e-06, "grad_norm": 10.08651351928711, "learning_rate": 1.9457397202204322e-07, "loss": 0.58, "mean_token_accuracy": 0.8170336484909058, "num_tokens": 17505104.0, "step": 460 }, { "epoch": 0.05864393843022516, "ewc_loss": 0.00494384765625, "ewc_loss_diag": 1.7657876014709473e-06, "ewc_loss_parallel": 3.1739473342895508e-06, "grad_norm": 9.536384582519531, "learning_rate": 1.949978804578211e-07, "loss": 0.5646, "mean_token_accuracy": 0.8275946974754333, "num_tokens": 17539052.0, "step": 461 }, { "epoch": 0.058771148708815675, "ewc_loss": 0.004913330078125, "ewc_loss_diag": 1.7881393432617188e-06, "ewc_loss_parallel": 3.129243850708008e-06, "grad_norm": 9.335737228393555, "learning_rate": 1.9542178889359897e-07, "loss": 0.6373, "mean_token_accuracy": 0.8036839962005615, "num_tokens": 17578942.0, "step": 462 }, { "epoch": 0.05889835898740618, "ewc_loss": 0.004913330078125, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 3.11434268951416e-06, "grad_norm": 10.024431228637695, "learning_rate": 1.9584569732937684e-07, "loss": 0.6173, "mean_token_accuracy": 0.8124081492424011, "num_tokens": 17611931.0, "step": 463 }, { "epoch": 0.05902556926599669, "ewc_loss": 0.004974365234375, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 3.1888484954833984e-06, "grad_norm": 9.7913236618042, "learning_rate": 1.962696057651547e-07, "loss": 0.5201, "mean_token_accuracy": 0.8390247225761414, "num_tokens": 17648764.0, "step": 464 }, { "epoch": 0.059152779544587204, "ewc_loss": 0.00494384765625, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 3.159046173095703e-06, "grad_norm": 9.778266906738281, "learning_rate": 1.9669351420093258e-07, "loss": 0.6317, "mean_token_accuracy": 0.8067923188209534, "num_tokens": 17683099.0, "step": 465 }, { "epoch": 0.05927998982317771, "ewc_loss": 0.004974365234375, "ewc_loss_diag": 1.8253922462463379e-06, "ewc_loss_parallel": 3.159046173095703e-06, "grad_norm": 9.345198631286621, "learning_rate": 1.9711742263671046e-07, "loss": 0.6158, "mean_token_accuracy": 0.8146716356277466, "num_tokens": 17727120.0, "step": 466 }, { "epoch": 0.059407200101768226, "ewc_loss": 0.004974365234375, "ewc_loss_diag": 1.817941665649414e-06, "ewc_loss_parallel": 3.159046173095703e-06, "grad_norm": 9.583758354187012, "learning_rate": 1.9754133107248833e-07, "loss": 0.572, "mean_token_accuracy": 0.8173158764839172, "num_tokens": 17761676.0, "step": 467 }, { "epoch": 0.059534410380358734, "ewc_loss": 0.0050048828125, "ewc_loss_diag": 1.817941665649414e-06, "ewc_loss_parallel": 3.1888484954833984e-06, "grad_norm": 9.335535049438477, "learning_rate": 1.979652395082662e-07, "loss": 0.6378, "mean_token_accuracy": 0.8058986067771912, "num_tokens": 17802311.0, "step": 468 }, { "epoch": 0.05966162065894924, "ewc_loss": 0.0050048828125, "ewc_loss_diag": 1.8253922462463379e-06, "ewc_loss_parallel": 3.1888484954833984e-06, "grad_norm": 9.024989128112793, "learning_rate": 1.9838914794404408e-07, "loss": 0.5285, "mean_token_accuracy": 0.8329166173934937, "num_tokens": 17839048.0, "step": 469 }, { "epoch": 0.059788830937539755, "ewc_loss": 0.004974365234375, "ewc_loss_diag": 1.8253922462463379e-06, "ewc_loss_parallel": 3.159046173095703e-06, "grad_norm": 9.57983684539795, "learning_rate": 1.9881305637982195e-07, "loss": 0.602, "mean_token_accuracy": 0.8150195479393005, "num_tokens": 17881510.0, "step": 470 }, { "epoch": 0.05991604121613026, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8328428268432617e-06, "ewc_loss_parallel": 3.2186508178710938e-06, "grad_norm": 9.721636772155762, "learning_rate": 1.9923696481559982e-07, "loss": 0.6282, "mean_token_accuracy": 0.8094696402549744, "num_tokens": 17915080.0, "step": 471 }, { "epoch": 0.06004325149472077, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8402934074401855e-06, "ewc_loss_parallel": 3.2335519790649414e-06, "grad_norm": 9.8896484375, "learning_rate": 1.996608732513777e-07, "loss": 0.6602, "mean_token_accuracy": 0.799119770526886, "num_tokens": 17957972.0, "step": 472 }, { "epoch": 0.060170461773311285, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8402934074401855e-06, "ewc_loss_parallel": 3.2186508178710938e-06, "grad_norm": 9.28878116607666, "learning_rate": 2.0008478168715557e-07, "loss": 0.5293, "mean_token_accuracy": 0.8328790664672852, "num_tokens": 17998008.0, "step": 473 }, { "epoch": 0.06029767205190179, "ewc_loss": 0.0050048828125, "ewc_loss_diag": 1.8477439880371094e-06, "ewc_loss_parallel": 3.1739473342895508e-06, "grad_norm": 9.331504821777344, "learning_rate": 2.0050869012293344e-07, "loss": 0.5388, "mean_token_accuracy": 0.830407440662384, "num_tokens": 18032427.0, "step": 474 }, { "epoch": 0.060424882330492306, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8402934074401855e-06, "ewc_loss_parallel": 3.2335519790649414e-06, "grad_norm": 9.341567993164062, "learning_rate": 2.009325985587113e-07, "loss": 0.5684, "mean_token_accuracy": 0.8226953148841858, "num_tokens": 18069848.0, "step": 475 }, { "epoch": 0.060552092609082814, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8477439880371094e-06, "ewc_loss_parallel": 3.2186508178710938e-06, "grad_norm": 9.363251686096191, "learning_rate": 2.0135650699448918e-07, "loss": 0.6624, "mean_token_accuracy": 0.7971212863922119, "num_tokens": 18109240.0, "step": 476 }, { "epoch": 0.06067930288767332, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8551945686340332e-06, "ewc_loss_parallel": 3.2186508178710938e-06, "grad_norm": 9.381781578063965, "learning_rate": 2.0178041543026706e-07, "loss": 0.6329, "mean_token_accuracy": 0.8078576922416687, "num_tokens": 18146849.0, "step": 477 }, { "epoch": 0.060806513166263836, "ewc_loss": 0.00506591796875, "ewc_loss_diag": 1.8551945686340332e-06, "ewc_loss_parallel": 3.2186508178710938e-06, "grad_norm": 9.399118423461914, "learning_rate": 2.022043238660449e-07, "loss": 0.6638, "mean_token_accuracy": 0.7961681485176086, "num_tokens": 18186872.0, "step": 478 }, { "epoch": 0.06093372344485434, "ewc_loss": 0.005096435546875, "ewc_loss_diag": 1.8551945686340332e-06, "ewc_loss_parallel": 3.248453140258789e-06, "grad_norm": 9.558821678161621, "learning_rate": 2.026282323018228e-07, "loss": 0.639, "mean_token_accuracy": 0.806020200252533, "num_tokens": 18222299.0, "step": 479 }, { "epoch": 0.06106093372344486, "ewc_loss": 0.005096435546875, "ewc_loss_diag": 1.8551945686340332e-06, "ewc_loss_parallel": 3.2335519790649414e-06, "grad_norm": 9.294241905212402, "learning_rate": 2.0305214073760065e-07, "loss": 0.6432, "mean_token_accuracy": 0.8065215349197388, "num_tokens": 18261645.0, "step": 480 }, { "epoch": 0.061188144002035365, "ewc_loss": 0.005096435546875, "ewc_loss_diag": 1.862645149230957e-06, "ewc_loss_parallel": 3.2335519790649414e-06, "grad_norm": 10.140767097473145, "learning_rate": 2.0347604917337855e-07, "loss": 0.6137, "mean_token_accuracy": 0.8123607635498047, "num_tokens": 18292970.0, "step": 481 }, { "epoch": 0.06131535428062587, "ewc_loss": 0.005157470703125, "ewc_loss_diag": 1.8551945686340332e-06, "ewc_loss_parallel": 3.3080577850341797e-06, "grad_norm": 9.563243865966797, "learning_rate": 2.038999576091564e-07, "loss": 0.5594, "mean_token_accuracy": 0.8285411596298218, "num_tokens": 18331133.0, "step": 482 }, { "epoch": 0.06144256455921639, "ewc_loss": 0.005096435546875, "ewc_loss_diag": 1.8700957298278809e-06, "ewc_loss_parallel": 3.2335519790649414e-06, "grad_norm": 9.292129516601562, "learning_rate": 2.043238660449343e-07, "loss": 0.6461, "mean_token_accuracy": 0.8020658493041992, "num_tokens": 18368905.0, "step": 483 }, { "epoch": 0.061569774837806894, "ewc_loss": 0.005126953125, "ewc_loss_diag": 1.8775463104248047e-06, "ewc_loss_parallel": 3.2335519790649414e-06, "grad_norm": 9.778733253479004, "learning_rate": 2.0474777448071214e-07, "loss": 0.5713, "mean_token_accuracy": 0.8248660564422607, "num_tokens": 18406362.0, "step": 484 }, { "epoch": 0.0616969851163974, "ewc_loss": 0.00518798828125, "ewc_loss_diag": 1.8849968910217285e-06, "ewc_loss_parallel": 3.293156623840332e-06, "grad_norm": 9.623018264770508, "learning_rate": 2.0517168291649004e-07, "loss": 0.6092, "mean_token_accuracy": 0.8139483332633972, "num_tokens": 18439246.0, "step": 485 }, { "epoch": 0.061824195394987916, "ewc_loss": 0.005157470703125, "ewc_loss_diag": 1.8849968910217285e-06, "ewc_loss_parallel": 3.2782554626464844e-06, "grad_norm": 9.699767112731934, "learning_rate": 2.0559559135226788e-07, "loss": 0.6118, "mean_token_accuracy": 0.8102177381515503, "num_tokens": 18473408.0, "step": 486 }, { "epoch": 0.06195140567357842, "ewc_loss": 0.00518798828125, "ewc_loss_diag": 1.8924474716186523e-06, "ewc_loss_parallel": 3.2782554626464844e-06, "grad_norm": 9.634740829467773, "learning_rate": 2.0601949978804578e-07, "loss": 0.5657, "mean_token_accuracy": 0.8240082263946533, "num_tokens": 18513246.0, "step": 487 }, { "epoch": 0.06207861595216894, "ewc_loss": 0.005157470703125, "ewc_loss_diag": 1.8924474716186523e-06, "ewc_loss_parallel": 3.2633543014526367e-06, "grad_norm": 9.435851097106934, "learning_rate": 2.0644340822382363e-07, "loss": 0.538, "mean_token_accuracy": 0.8315233588218689, "num_tokens": 18549125.0, "step": 488 }, { "epoch": 0.062205826230759445, "ewc_loss": 0.005157470703125, "ewc_loss_diag": 1.8998980522155762e-06, "ewc_loss_parallel": 3.2633543014526367e-06, "grad_norm": 9.236921310424805, "learning_rate": 2.0686731665960153e-07, "loss": 0.5872, "mean_token_accuracy": 0.8240963220596313, "num_tokens": 18585306.0, "step": 489 }, { "epoch": 0.06233303650934995, "ewc_loss": 0.00518798828125, "ewc_loss_diag": 1.9073486328125e-06, "ewc_loss_parallel": 3.2782554626464844e-06, "grad_norm": 9.394901275634766, "learning_rate": 2.0729122509537937e-07, "loss": 0.5479, "mean_token_accuracy": 0.8318822383880615, "num_tokens": 18624378.0, "step": 490 }, { "epoch": 0.06246024678794047, "ewc_loss": 0.005279541015625, "ewc_loss_diag": 1.9222497940063477e-06, "ewc_loss_parallel": 3.3527612686157227e-06, "grad_norm": 9.74244213104248, "learning_rate": 2.0771513353115727e-07, "loss": 0.5923, "mean_token_accuracy": 0.816420316696167, "num_tokens": 18660814.0, "step": 491 }, { "epoch": 0.06258745706653097, "ewc_loss": 0.00531005859375, "ewc_loss_diag": 1.9371509552001953e-06, "ewc_loss_parallel": 3.3676624298095703e-06, "grad_norm": 9.446430206298828, "learning_rate": 2.0813904196693512e-07, "loss": 0.558, "mean_token_accuracy": 0.8285634517669678, "num_tokens": 18702129.0, "step": 492 }, { "epoch": 0.06271466734512149, "ewc_loss": 0.0052490234375, "ewc_loss_diag": 1.9371509552001953e-06, "ewc_loss_parallel": 3.293156623840332e-06, "grad_norm": 10.047844886779785, "learning_rate": 2.0856295040271302e-07, "loss": 0.6325, "mean_token_accuracy": 0.8031420707702637, "num_tokens": 18737496.0, "step": 493 }, { "epoch": 0.06284187762371199, "ewc_loss": 0.00537109375, "ewc_loss_diag": 1.952052116394043e-06, "ewc_loss_parallel": 3.3974647521972656e-06, "grad_norm": 9.300055503845215, "learning_rate": 2.0898685883849086e-07, "loss": 0.587, "mean_token_accuracy": 0.8216124773025513, "num_tokens": 18779332.0, "step": 494 }, { "epoch": 0.0629690879023025, "ewc_loss": 0.005279541015625, "ewc_loss_diag": 1.952052116394043e-06, "ewc_loss_parallel": 3.3229589462280273e-06, "grad_norm": 9.412162780761719, "learning_rate": 2.0941076727426874e-07, "loss": 0.5728, "mean_token_accuracy": 0.8230008482933044, "num_tokens": 18817976.0, "step": 495 }, { "epoch": 0.06309629818089302, "ewc_loss": 0.005340576171875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.3676624298095703e-06, "grad_norm": 9.262269020080566, "learning_rate": 2.098346757100466e-07, "loss": 0.5637, "mean_token_accuracy": 0.8238720893859863, "num_tokens": 18862860.0, "step": 496 }, { "epoch": 0.06322350845948353, "ewc_loss": 0.005340576171875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.3676624298095703e-06, "grad_norm": 9.568196296691895, "learning_rate": 2.1025858414582448e-07, "loss": 0.6061, "mean_token_accuracy": 0.8122659921646118, "num_tokens": 18899025.0, "step": 497 }, { "epoch": 0.06335071873807403, "ewc_loss": 0.00537109375, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4123659133911133e-06, "grad_norm": 9.747293472290039, "learning_rate": 2.1068249258160238e-07, "loss": 0.5726, "mean_token_accuracy": 0.8165119886398315, "num_tokens": 18933118.0, "step": 498 }, { "epoch": 0.06347792901666455, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4421682357788086e-06, "grad_norm": 9.785032272338867, "learning_rate": 2.1110640101738023e-07, "loss": 0.5739, "mean_token_accuracy": 0.8247193098068237, "num_tokens": 18969165.0, "step": 499 }, { "epoch": 0.06360513929525506, "ewc_loss": 0.00537109375, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4123659133911133e-06, "grad_norm": 9.882635116577148, "learning_rate": 2.1153030945315813e-07, "loss": 0.5819, "mean_token_accuracy": 0.818889856338501, "num_tokens": 19003882.0, "step": 500 }, { "epoch": 0.06373234957384556, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4421682357788086e-06, "grad_norm": 9.818955421447754, "learning_rate": 2.1195421788893597e-07, "loss": 0.5821, "mean_token_accuracy": 0.8220664262771606, "num_tokens": 19037540.0, "step": 501 }, { "epoch": 0.06385955985243608, "ewc_loss": 0.005401611328125, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.427267074584961e-06, "grad_norm": 9.540175437927246, "learning_rate": 2.1237812632471387e-07, "loss": 0.5914, "mean_token_accuracy": 0.8127791285514832, "num_tokens": 19077270.0, "step": 502 }, { "epoch": 0.06398677013102659, "ewc_loss": 0.00537109375, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.3974647521972656e-06, "grad_norm": 9.450323104858398, "learning_rate": 2.1280203476049172e-07, "loss": 0.6802, "mean_token_accuracy": 0.7876783609390259, "num_tokens": 19117579.0, "step": 503 }, { "epoch": 0.06411398040961709, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4421682357788086e-06, "grad_norm": 9.470223426818848, "learning_rate": 2.1322594319626962e-07, "loss": 0.62, "mean_token_accuracy": 0.8087124228477478, "num_tokens": 19156599.0, "step": 504 }, { "epoch": 0.0642411906882076, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4421682357788086e-06, "grad_norm": 9.646299362182617, "learning_rate": 2.1364985163204746e-07, "loss": 0.612, "mean_token_accuracy": 0.8127672672271729, "num_tokens": 19187367.0, "step": 505 }, { "epoch": 0.06436840096679812, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4421682357788086e-06, "grad_norm": 9.664778709411621, "learning_rate": 2.1407376006782536e-07, "loss": 0.545, "mean_token_accuracy": 0.8296492695808411, "num_tokens": 19223916.0, "step": 506 }, { "epoch": 0.06449561124538863, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4570693969726562e-06, "grad_norm": 9.553851127624512, "learning_rate": 2.144976685036032e-07, "loss": 0.561, "mean_token_accuracy": 0.8263495564460754, "num_tokens": 19260336.0, "step": 507 }, { "epoch": 0.06462282152397913, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4570693969726562e-06, "grad_norm": 9.824525833129883, "learning_rate": 2.149215769393811e-07, "loss": 0.6257, "mean_token_accuracy": 0.8097680807113647, "num_tokens": 19299895.0, "step": 508 }, { "epoch": 0.06475003180256965, "ewc_loss": 0.005462646484375, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4868717193603516e-06, "grad_norm": 10.348100662231445, "learning_rate": 2.1534548537515895e-07, "loss": 0.5653, "mean_token_accuracy": 0.8241546154022217, "num_tokens": 19336614.0, "step": 509 }, { "epoch": 0.06487724208116016, "ewc_loss": 0.0054931640625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.5315752029418945e-06, "grad_norm": 9.590261459350586, "learning_rate": 2.1576939381093685e-07, "loss": 0.6151, "mean_token_accuracy": 0.8128787279129028, "num_tokens": 19378683.0, "step": 510 }, { "epoch": 0.06500445235975066, "ewc_loss": 0.00543212890625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4570693969726562e-06, "grad_norm": 9.375368118286133, "learning_rate": 2.161933022467147e-07, "loss": 0.599, "mean_token_accuracy": 0.8130125403404236, "num_tokens": 19422598.0, "step": 511 }, { "epoch": 0.06513166263834118, "ewc_loss": 0.005462646484375, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.4868717193603516e-06, "grad_norm": 9.44062614440918, "learning_rate": 2.166172106824926e-07, "loss": 0.5886, "mean_token_accuracy": 0.8215189576148987, "num_tokens": 19466972.0, "step": 512 }, { "epoch": 0.06525887291693169, "ewc_loss": 0.0054931640625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.516674041748047e-06, "grad_norm": 9.466642379760742, "learning_rate": 2.1704111911827044e-07, "loss": 0.5806, "mean_token_accuracy": 0.8170509338378906, "num_tokens": 19503809.0, "step": 513 }, { "epoch": 0.0653860831955222, "ewc_loss": 0.0054931640625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.516674041748047e-06, "grad_norm": 9.939876556396484, "learning_rate": 2.1746502755404831e-07, "loss": 0.6356, "mean_token_accuracy": 0.8101887106895447, "num_tokens": 19545713.0, "step": 514 }, { "epoch": 0.06551329347411271, "ewc_loss": 0.00555419921875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.5762786865234375e-06, "grad_norm": 9.71277904510498, "learning_rate": 2.178889359898262e-07, "loss": 0.6233, "mean_token_accuracy": 0.8116327524185181, "num_tokens": 19582272.0, "step": 515 }, { "epoch": 0.06564050375270322, "ewc_loss": 0.0054931640625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.5315752029418945e-06, "grad_norm": 9.3369140625, "learning_rate": 2.1831284442560406e-07, "loss": 0.6379, "mean_token_accuracy": 0.8066627979278564, "num_tokens": 19625574.0, "step": 516 }, { "epoch": 0.06576771403129372, "ewc_loss": 0.0054931640625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.546476364135742e-06, "grad_norm": 9.59676742553711, "learning_rate": 2.1873675286138193e-07, "loss": 0.6253, "mean_token_accuracy": 0.8061196804046631, "num_tokens": 19667791.0, "step": 517 }, { "epoch": 0.06589492430988424, "ewc_loss": 0.00555419921875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.591179847717285e-06, "grad_norm": 9.640327453613281, "learning_rate": 2.191606612971598e-07, "loss": 0.6557, "mean_token_accuracy": 0.8005516529083252, "num_tokens": 19708289.0, "step": 518 }, { "epoch": 0.06602213458847475, "ewc_loss": 0.00555419921875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.5762786865234375e-06, "grad_norm": 9.737789154052734, "learning_rate": 2.1958456973293768e-07, "loss": 0.6332, "mean_token_accuracy": 0.8017019033432007, "num_tokens": 19747444.0, "step": 519 }, { "epoch": 0.06614934486706527, "ewc_loss": 0.00555419921875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.5762786865234375e-06, "grad_norm": 9.463850975036621, "learning_rate": 2.2000847816871555e-07, "loss": 0.5216, "mean_token_accuracy": 0.8339775800704956, "num_tokens": 19787061.0, "step": 520 }, { "epoch": 0.06627655514565577, "ewc_loss": 0.005523681640625, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.56137752532959e-06, "grad_norm": 9.420076370239258, "learning_rate": 2.2043238660449342e-07, "loss": 0.5631, "mean_token_accuracy": 0.8268471956253052, "num_tokens": 19826893.0, "step": 521 }, { "epoch": 0.06640376542424628, "ewc_loss": 0.00555419921875, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 3.5762786865234375e-06, "grad_norm": 9.51524829864502, "learning_rate": 2.208562950402713e-07, "loss": 0.5551, "mean_token_accuracy": 0.8232802152633667, "num_tokens": 19864285.0, "step": 522 }, { "epoch": 0.0665309757028368, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.606081008911133e-06, "grad_norm": 9.69323444366455, "learning_rate": 2.2128020347604917e-07, "loss": 0.591, "mean_token_accuracy": 0.8171798586845398, "num_tokens": 19903806.0, "step": 523 }, { "epoch": 0.0666581859814273, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.6209821701049805e-06, "grad_norm": 9.69442367553711, "learning_rate": 2.2170411191182704e-07, "loss": 0.5681, "mean_token_accuracy": 0.8225791454315186, "num_tokens": 19950382.0, "step": 524 }, { "epoch": 0.06678539626001781, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.606081008911133e-06, "grad_norm": 9.672694206237793, "learning_rate": 2.221280203476049e-07, "loss": 0.5922, "mean_token_accuracy": 0.8147611618041992, "num_tokens": 19984765.0, "step": 525 }, { "epoch": 0.06691260653860832, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 3.6209821701049805e-06, "grad_norm": 9.81057357788086, "learning_rate": 2.2255192878338279e-07, "loss": 0.6329, "mean_token_accuracy": 0.8079673647880554, "num_tokens": 20021081.0, "step": 526 }, { "epoch": 0.06703981681719882, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.606081008911133e-06, "grad_norm": 10.002920150756836, "learning_rate": 2.2297583721916066e-07, "loss": 0.6076, "mean_token_accuracy": 0.8126838207244873, "num_tokens": 20055860.0, "step": 527 }, { "epoch": 0.06716702709578934, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.635883331298828e-06, "grad_norm": 9.794754028320312, "learning_rate": 2.2339974565493853e-07, "loss": 0.5604, "mean_token_accuracy": 0.8288071155548096, "num_tokens": 20095023.0, "step": 528 }, { "epoch": 0.06729423737437985, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.606081008911133e-06, "grad_norm": 9.687172889709473, "learning_rate": 2.238236540907164e-07, "loss": 0.6151, "mean_token_accuracy": 0.8116825222969055, "num_tokens": 20134620.0, "step": 529 }, { "epoch": 0.06742144765297035, "ewc_loss": 0.005615234375, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 3.606081008911133e-06, "grad_norm": 9.912405967712402, "learning_rate": 2.2424756252649428e-07, "loss": 0.5879, "mean_token_accuracy": 0.8156979084014893, "num_tokens": 20169690.0, "step": 530 }, { "epoch": 0.06754865793156087, "ewc_loss": 0.005615234375, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 3.6209821701049805e-06, "grad_norm": 9.945889472961426, "learning_rate": 2.2467147096227215e-07, "loss": 0.5797, "mean_token_accuracy": 0.8207337856292725, "num_tokens": 20203609.0, "step": 531 }, { "epoch": 0.06767586821015138, "ewc_loss": 0.00567626953125, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 3.6507844924926758e-06, "grad_norm": 9.881500244140625, "learning_rate": 2.2509537939805002e-07, "loss": 0.5866, "mean_token_accuracy": 0.8166300058364868, "num_tokens": 20234616.0, "step": 532 }, { "epoch": 0.0678030784887419, "ewc_loss": 0.00567626953125, "ewc_loss_diag": 2.0265579223632812e-06, "ewc_loss_parallel": 3.6507844924926758e-06, "grad_norm": 9.41101360321045, "learning_rate": 2.2551928783382787e-07, "loss": 0.5743, "mean_token_accuracy": 0.8209633827209473, "num_tokens": 20271634.0, "step": 533 }, { "epoch": 0.0679302887673324, "ewc_loss": 0.00567626953125, "ewc_loss_diag": 2.041459083557129e-06, "ewc_loss_parallel": 3.635883331298828e-06, "grad_norm": 9.83426570892334, "learning_rate": 2.2594319626960577e-07, "loss": 0.6031, "mean_token_accuracy": 0.8168992400169373, "num_tokens": 20309683.0, "step": 534 }, { "epoch": 0.06805749904592291, "ewc_loss": 0.00579833984375, "ewc_loss_diag": 2.041459083557129e-06, "ewc_loss_parallel": 3.7401914596557617e-06, "grad_norm": 9.587882041931152, "learning_rate": 2.263671047053836e-07, "loss": 0.5167, "mean_token_accuracy": 0.8339963555335999, "num_tokens": 20346013.0, "step": 535 }, { "epoch": 0.06818470932451343, "ewc_loss": 0.0057373046875, "ewc_loss_diag": 2.041459083557129e-06, "ewc_loss_parallel": 3.680586814880371e-06, "grad_norm": 9.735937118530273, "learning_rate": 2.267910131411615e-07, "loss": 0.5752, "mean_token_accuracy": 0.8245857954025269, "num_tokens": 20387613.0, "step": 536 }, { "epoch": 0.06831191960310393, "ewc_loss": 0.00579833984375, "ewc_loss_diag": 2.0712614059448242e-06, "ewc_loss_parallel": 3.725290298461914e-06, "grad_norm": 10.109315872192383, "learning_rate": 2.2721492157693936e-07, "loss": 0.5747, "mean_token_accuracy": 0.8210064172744751, "num_tokens": 20417579.0, "step": 537 }, { "epoch": 0.06843912988169444, "ewc_loss": 0.005889892578125, "ewc_loss_diag": 2.0712614059448242e-06, "ewc_loss_parallel": 3.814697265625e-06, "grad_norm": 9.804459571838379, "learning_rate": 2.2763883001271726e-07, "loss": 0.5464, "mean_token_accuracy": 0.8292621970176697, "num_tokens": 20455100.0, "step": 538 }, { "epoch": 0.06856634016028496, "ewc_loss": 0.00579833984375, "ewc_loss_diag": 2.0563602447509766e-06, "ewc_loss_parallel": 3.7550926208496094e-06, "grad_norm": 9.596105575561523, "learning_rate": 2.280627384484951e-07, "loss": 0.6091, "mean_token_accuracy": 0.808890700340271, "num_tokens": 20496458.0, "step": 539 }, { "epoch": 0.06869355043887546, "ewc_loss": 0.00579833984375, "ewc_loss_diag": 2.0563602447509766e-06, "ewc_loss_parallel": 3.7401914596557617e-06, "grad_norm": 9.787989616394043, "learning_rate": 2.28486646884273e-07, "loss": 0.582, "mean_token_accuracy": 0.818739652633667, "num_tokens": 20533758.0, "step": 540 }, { "epoch": 0.06882076071746597, "ewc_loss": 0.005950927734375, "ewc_loss_diag": 2.0712614059448242e-06, "ewc_loss_parallel": 3.874301910400391e-06, "grad_norm": 9.806029319763184, "learning_rate": 2.2891055532005085e-07, "loss": 0.6277, "mean_token_accuracy": 0.8101136684417725, "num_tokens": 20567259.0, "step": 541 }, { "epoch": 0.06894797099605648, "ewc_loss": 0.00592041015625, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 3.814697265625e-06, "grad_norm": 9.885204315185547, "learning_rate": 2.2933446375582875e-07, "loss": 0.6088, "mean_token_accuracy": 0.8150622844696045, "num_tokens": 20599333.0, "step": 542 }, { "epoch": 0.06907518127464699, "ewc_loss": 0.00592041015625, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 3.814697265625e-06, "grad_norm": 9.70207405090332, "learning_rate": 2.297583721916066e-07, "loss": 0.5444, "mean_token_accuracy": 0.8319774866104126, "num_tokens": 20640195.0, "step": 543 }, { "epoch": 0.0692023915532375, "ewc_loss": 0.0059814453125, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 3.874301910400391e-06, "grad_norm": 9.624871253967285, "learning_rate": 2.301822806273845e-07, "loss": 0.5657, "mean_token_accuracy": 0.8243608474731445, "num_tokens": 20682617.0, "step": 544 }, { "epoch": 0.06932960183182801, "ewc_loss": 0.00592041015625, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 3.844499588012695e-06, "grad_norm": 10.378378868103027, "learning_rate": 2.3060618906316234e-07, "loss": 0.6191, "mean_token_accuracy": 0.8109472393989563, "num_tokens": 20718032.0, "step": 545 }, { "epoch": 0.06945681211041853, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 10.242538452148438, "learning_rate": 2.3103009749894024e-07, "loss": 0.582, "mean_token_accuracy": 0.8203479051589966, "num_tokens": 20755190.0, "step": 546 }, { "epoch": 0.06958402238900903, "ewc_loss": 0.006011962890625, "ewc_loss_diag": 2.1010637283325195e-06, "ewc_loss_parallel": 3.904104232788086e-06, "grad_norm": 9.491110801696777, "learning_rate": 2.3145400593471808e-07, "loss": 0.608, "mean_token_accuracy": 0.8119632005691528, "num_tokens": 20799088.0, "step": 547 }, { "epoch": 0.06971123266759954, "ewc_loss": 0.00592041015625, "ewc_loss_diag": 2.1010637283325195e-06, "ewc_loss_parallel": 3.7997961044311523e-06, "grad_norm": 10.019765853881836, "learning_rate": 2.3187791437049598e-07, "loss": 0.6403, "mean_token_accuracy": 0.8011938333511353, "num_tokens": 20831819.0, "step": 548 }, { "epoch": 0.06983844294619006, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.834185600280762, "learning_rate": 2.3230182280627383e-07, "loss": 0.5694, "mean_token_accuracy": 0.8248998522758484, "num_tokens": 20861291.0, "step": 549 }, { "epoch": 0.06996565322478056, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.1010637283325195e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.7590913772583, "learning_rate": 2.327257312420517e-07, "loss": 0.5447, "mean_token_accuracy": 0.8331290483474731, "num_tokens": 20897446.0, "step": 550 }, { "epoch": 0.07009286350337107, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.904104232788086e-06, "grad_norm": 9.803975105285645, "learning_rate": 2.3314963967782957e-07, "loss": 0.5682, "mean_token_accuracy": 0.8199253082275391, "num_tokens": 20934423.0, "step": 551 }, { "epoch": 0.07022007378196159, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.904104232788086e-06, "grad_norm": 10.0169095993042, "learning_rate": 2.3357354811360745e-07, "loss": 0.5848, "mean_token_accuracy": 0.8212739825248718, "num_tokens": 20978394.0, "step": 552 }, { "epoch": 0.07034728406055209, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.991460800170898, "learning_rate": 2.3399745654938532e-07, "loss": 0.6127, "mean_token_accuracy": 0.8124918341636658, "num_tokens": 21010669.0, "step": 553 }, { "epoch": 0.0704744943391426, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.661580085754395, "learning_rate": 2.344213649851632e-07, "loss": 0.5803, "mean_token_accuracy": 0.8222348690032959, "num_tokens": 21047696.0, "step": 554 }, { "epoch": 0.07060170461773312, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.904104232788086e-06, "grad_norm": 9.670389175415039, "learning_rate": 2.3484527342094106e-07, "loss": 0.6621, "mean_token_accuracy": 0.7989246249198914, "num_tokens": 21091437.0, "step": 555 }, { "epoch": 0.07072891489632362, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.904104232788086e-06, "grad_norm": 9.57279109954834, "learning_rate": 2.3526918185671894e-07, "loss": 0.6226, "mean_token_accuracy": 0.8083880543708801, "num_tokens": 21128880.0, "step": 556 }, { "epoch": 0.07085612517491413, "ewc_loss": 0.00604248046875, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.746590614318848, "learning_rate": 2.356930902924968e-07, "loss": 0.5752, "mean_token_accuracy": 0.8235883712768555, "num_tokens": 21168523.0, "step": 557 }, { "epoch": 0.07098333545350465, "ewc_loss": 0.006072998046875, "ewc_loss_diag": 2.130866050720215e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.789083480834961, "learning_rate": 2.3611699872827468e-07, "loss": 0.6626, "mean_token_accuracy": 0.7966096997261047, "num_tokens": 21211376.0, "step": 558 }, { "epoch": 0.07111054573209516, "ewc_loss": 0.006103515625, "ewc_loss_diag": 2.1457672119140625e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.831239700317383, "learning_rate": 2.3654090716405255e-07, "loss": 0.5925, "mean_token_accuracy": 0.8173473477363586, "num_tokens": 21249162.0, "step": 559 }, { "epoch": 0.07123775601068566, "ewc_loss": 0.006103515625, "ewc_loss_diag": 2.1457672119140625e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.672728538513184, "learning_rate": 2.3696481559983043e-07, "loss": 0.6215, "mean_token_accuracy": 0.8074449300765991, "num_tokens": 21282351.0, "step": 560 }, { "epoch": 0.07136496628927617, "ewc_loss": 0.006103515625, "ewc_loss_diag": 2.1457672119140625e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 10.230630874633789, "learning_rate": 2.373887240356083e-07, "loss": 0.6106, "mean_token_accuracy": 0.8110557198524475, "num_tokens": 21319508.0, "step": 561 }, { "epoch": 0.07149217656786669, "ewc_loss": 0.006195068359375, "ewc_loss_diag": 2.16066837310791e-06, "ewc_loss_parallel": 4.023313522338867e-06, "grad_norm": 10.04974365234375, "learning_rate": 2.3781263247138617e-07, "loss": 0.5754, "mean_token_accuracy": 0.823478102684021, "num_tokens": 21357111.0, "step": 562 }, { "epoch": 0.07161938684645719, "ewc_loss": 0.006103515625, "ewc_loss_diag": 2.16066837310791e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.83903694152832, "learning_rate": 2.3823654090716404e-07, "loss": 0.5774, "mean_token_accuracy": 0.8193584680557251, "num_tokens": 21393222.0, "step": 563 }, { "epoch": 0.0717465971250477, "ewc_loss": 0.006103515625, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 3.933906555175781e-06, "grad_norm": 9.737123489379883, "learning_rate": 2.386604493429419e-07, "loss": 0.5322, "mean_token_accuracy": 0.8322763442993164, "num_tokens": 21429595.0, "step": 564 }, { "epoch": 0.07187380740363822, "ewc_loss": 0.00616455078125, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 3.9637088775634766e-06, "grad_norm": 9.824590682983398, "learning_rate": 2.390843577787198e-07, "loss": 0.6281, "mean_token_accuracy": 0.8034713864326477, "num_tokens": 21467341.0, "step": 565 }, { "epoch": 0.07200101768222872, "ewc_loss": 0.00616455078125, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 3.993511199951172e-06, "grad_norm": 9.661853790283203, "learning_rate": 2.3950826621449766e-07, "loss": 0.5037, "mean_token_accuracy": 0.8449063897132874, "num_tokens": 21510927.0, "step": 566 }, { "epoch": 0.07212822796081923, "ewc_loss": 0.00616455078125, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 3.993511199951172e-06, "grad_norm": 10.13613510131836, "learning_rate": 2.3993217465027556e-07, "loss": 0.5805, "mean_token_accuracy": 0.8215308785438538, "num_tokens": 21551604.0, "step": 567 }, { "epoch": 0.07225543823940975, "ewc_loss": 0.0062255859375, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 4.023313522338867e-06, "grad_norm": 10.124656677246094, "learning_rate": 2.403560830860534e-07, "loss": 0.6316, "mean_token_accuracy": 0.8098350763320923, "num_tokens": 21591871.0, "step": 568 }, { "epoch": 0.07238264851800025, "ewc_loss": 0.0062255859375, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 4.023313522338867e-06, "grad_norm": 9.820836067199707, "learning_rate": 2.4077999152183125e-07, "loss": 0.6284, "mean_token_accuracy": 0.8060309886932373, "num_tokens": 21629810.0, "step": 569 }, { "epoch": 0.07250985879659076, "ewc_loss": 0.006195068359375, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 3.993511199951172e-06, "grad_norm": 9.90116024017334, "learning_rate": 2.4120389995760915e-07, "loss": 0.527, "mean_token_accuracy": 0.8398044109344482, "num_tokens": 21668706.0, "step": 570 }, { "epoch": 0.07263706907518128, "ewc_loss": 0.00628662109375, "ewc_loss_diag": 2.175569534301758e-06, "ewc_loss_parallel": 4.082918167114258e-06, "grad_norm": 10.042040824890137, "learning_rate": 2.41627808393387e-07, "loss": 0.5728, "mean_token_accuracy": 0.8210462927818298, "num_tokens": 21712725.0, "step": 571 }, { "epoch": 0.07276427935377179, "ewc_loss": 0.00634765625, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 4.1425228118896484e-06, "grad_norm": 9.947761535644531, "learning_rate": 2.420517168291649e-07, "loss": 0.5812, "mean_token_accuracy": 0.8220089673995972, "num_tokens": 21746086.0, "step": 572 }, { "epoch": 0.07289148963236229, "ewc_loss": 0.00628662109375, "ewc_loss_diag": 2.205371856689453e-06, "ewc_loss_parallel": 4.082918167114258e-06, "grad_norm": 9.827630996704102, "learning_rate": 2.4247562526494274e-07, "loss": 0.5414, "mean_token_accuracy": 0.829598069190979, "num_tokens": 21779573.0, "step": 573 }, { "epoch": 0.0730186999109528, "ewc_loss": 0.00628662109375, "ewc_loss_diag": 2.205371856689453e-06, "ewc_loss_parallel": 4.082918167114258e-06, "grad_norm": 9.978434562683105, "learning_rate": 2.4289953370072064e-07, "loss": 0.6143, "mean_token_accuracy": 0.8104214668273926, "num_tokens": 21818766.0, "step": 574 }, { "epoch": 0.07314591018954332, "ewc_loss": 0.006317138671875, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 4.112720489501953e-06, "grad_norm": 9.807023048400879, "learning_rate": 2.433234421364985e-07, "loss": 0.5705, "mean_token_accuracy": 0.8271840810775757, "num_tokens": 21855539.0, "step": 575 }, { "epoch": 0.07327312046813382, "ewc_loss": 0.00634765625, "ewc_loss_diag": 2.205371856689453e-06, "ewc_loss_parallel": 4.112720489501953e-06, "grad_norm": 9.929441452026367, "learning_rate": 2.437473505722764e-07, "loss": 0.5335, "mean_token_accuracy": 0.833280622959137, "num_tokens": 21892567.0, "step": 576 }, { "epoch": 0.07340033074672433, "ewc_loss": 0.00634765625, "ewc_loss_diag": 2.205371856689453e-06, "ewc_loss_parallel": 4.112720489501953e-06, "grad_norm": 9.800004959106445, "learning_rate": 2.4417125900805423e-07, "loss": 0.5531, "mean_token_accuracy": 0.8219626545906067, "num_tokens": 21928607.0, "step": 577 }, { "epoch": 0.07352754102531485, "ewc_loss": 0.00634765625, "ewc_loss_diag": 2.2202730178833008e-06, "ewc_loss_parallel": 4.112720489501953e-06, "grad_norm": 9.855555534362793, "learning_rate": 2.4459516744383213e-07, "loss": 0.601, "mean_token_accuracy": 0.8150548338890076, "num_tokens": 21963781.0, "step": 578 }, { "epoch": 0.07365475130390535, "ewc_loss": 0.00640869140625, "ewc_loss_diag": 2.2351741790771484e-06, "ewc_loss_parallel": 4.172325134277344e-06, "grad_norm": 9.93221664428711, "learning_rate": 2.4501907587961e-07, "loss": 0.5612, "mean_token_accuracy": 0.8265581130981445, "num_tokens": 21999140.0, "step": 579 }, { "epoch": 0.07378196158249586, "ewc_loss": 0.00640869140625, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 4.1425228118896484e-06, "grad_norm": 9.849748611450195, "learning_rate": 2.454429843153879e-07, "loss": 0.6617, "mean_token_accuracy": 0.7969406843185425, "num_tokens": 22038703.0, "step": 580 }, { "epoch": 0.07390917186108638, "ewc_loss": 0.00640869140625, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 4.172325134277344e-06, "grad_norm": 10.129508018493652, "learning_rate": 2.458668927511657e-07, "loss": 0.5747, "mean_token_accuracy": 0.8135624527931213, "num_tokens": 22071052.0, "step": 581 }, { "epoch": 0.07403638213967688, "ewc_loss": 0.0064697265625, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 10.054374694824219, "learning_rate": 2.462908011869436e-07, "loss": 0.5392, "mean_token_accuracy": 0.8323644995689392, "num_tokens": 22109629.0, "step": 582 }, { "epoch": 0.0741635924182674, "ewc_loss": 0.00653076171875, "ewc_loss_diag": 2.3096799850463867e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 9.974336624145508, "learning_rate": 2.4671470962272147e-07, "loss": 0.5525, "mean_token_accuracy": 0.8294460773468018, "num_tokens": 22151544.0, "step": 583 }, { "epoch": 0.07429080269685791, "ewc_loss": 0.006500244140625, "ewc_loss_diag": 2.294778823852539e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 10.019804954528809, "learning_rate": 2.4713861805849937e-07, "loss": 0.5301, "mean_token_accuracy": 0.8321850299835205, "num_tokens": 22189241.0, "step": 584 }, { "epoch": 0.07441801297544842, "ewc_loss": 0.00653076171875, "ewc_loss_diag": 2.3096799850463867e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 10.132616996765137, "learning_rate": 2.475625264942772e-07, "loss": 0.5457, "mean_token_accuracy": 0.8291467428207397, "num_tokens": 22229138.0, "step": 585 }, { "epoch": 0.07454522325403892, "ewc_loss": 0.006591796875, "ewc_loss_diag": 2.3096799850463867e-06, "ewc_loss_parallel": 4.26173210144043e-06, "grad_norm": 9.882883071899414, "learning_rate": 2.479864349300551e-07, "loss": 0.5848, "mean_token_accuracy": 0.8198184967041016, "num_tokens": 22264930.0, "step": 586 }, { "epoch": 0.07467243353262944, "ewc_loss": 0.0064697265625, "ewc_loss_diag": 2.3096799850463867e-06, "ewc_loss_parallel": 4.172325134277344e-06, "grad_norm": 10.043253898620605, "learning_rate": 2.4841034336583296e-07, "loss": 0.525, "mean_token_accuracy": 0.8302148580551147, "num_tokens": 22297162.0, "step": 587 }, { "epoch": 0.07479964381121995, "ewc_loss": 0.00653076171875, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 9.971275329589844, "learning_rate": 2.488342518016108e-07, "loss": 0.6108, "mean_token_accuracy": 0.8100457191467285, "num_tokens": 22334779.0, "step": 588 }, { "epoch": 0.07492685408981045, "ewc_loss": 0.006561279296875, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.231929779052734e-06, "grad_norm": 9.918607711791992, "learning_rate": 2.492581602373887e-07, "loss": 0.5658, "mean_token_accuracy": 0.8255925178527832, "num_tokens": 22372820.0, "step": 589 }, { "epoch": 0.07505406436840097, "ewc_loss": 0.00653076171875, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 10.067837715148926, "learning_rate": 2.4968206867316655e-07, "loss": 0.5873, "mean_token_accuracy": 0.8182209730148315, "num_tokens": 22417841.0, "step": 590 }, { "epoch": 0.07518127464699148, "ewc_loss": 0.006591796875, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.26173210144043e-06, "grad_norm": 10.054758071899414, "learning_rate": 2.5010597710894445e-07, "loss": 0.5013, "mean_token_accuracy": 0.843623161315918, "num_tokens": 22449133.0, "step": 591 }, { "epoch": 0.07530848492558198, "ewc_loss": 0.006591796875, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.26173210144043e-06, "grad_norm": 10.149433135986328, "learning_rate": 2.505298855447223e-07, "loss": 0.5661, "mean_token_accuracy": 0.8249332904815674, "num_tokens": 22491145.0, "step": 592 }, { "epoch": 0.0754356952041725, "ewc_loss": 0.006622314453125, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.291534423828125e-06, "grad_norm": 10.054128646850586, "learning_rate": 2.509537939805002e-07, "loss": 0.5792, "mean_token_accuracy": 0.8236143589019775, "num_tokens": 22531738.0, "step": 593 }, { "epoch": 0.07556290548276301, "ewc_loss": 0.006622314453125, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.291534423828125e-06, "grad_norm": 10.616945266723633, "learning_rate": 2.513777024162781e-07, "loss": 0.5586, "mean_token_accuracy": 0.828522801399231, "num_tokens": 22563005.0, "step": 594 }, { "epoch": 0.07569011576135352, "ewc_loss": 0.00665283203125, "ewc_loss_diag": 2.3245811462402344e-06, "ewc_loss_parallel": 4.32133674621582e-06, "grad_norm": 9.967446327209473, "learning_rate": 2.5180161085205594e-07, "loss": 0.5384, "mean_token_accuracy": 0.8339316844940186, "num_tokens": 22601333.0, "step": 595 }, { "epoch": 0.07581732603994402, "ewc_loss": 0.00653076171875, "ewc_loss_diag": 2.339482307434082e-06, "ewc_loss_parallel": 4.202127456665039e-06, "grad_norm": 10.19503116607666, "learning_rate": 2.522255192878338e-07, "loss": 0.5951, "mean_token_accuracy": 0.8161745071411133, "num_tokens": 22639528.0, "step": 596 }, { "epoch": 0.07594453631853454, "ewc_loss": 0.0067138671875, "ewc_loss_diag": 2.339482307434082e-06, "ewc_loss_parallel": 4.351139068603516e-06, "grad_norm": 10.084441184997559, "learning_rate": 2.526494277236117e-07, "loss": 0.5772, "mean_token_accuracy": 0.8212188482284546, "num_tokens": 22675290.0, "step": 597 }, { "epoch": 0.07607174659712505, "ewc_loss": 0.0067138671875, "ewc_loss_diag": 2.3543834686279297e-06, "ewc_loss_parallel": 4.351139068603516e-06, "grad_norm": 10.21801471710205, "learning_rate": 2.530733361593896e-07, "loss": 0.5515, "mean_token_accuracy": 0.8264105319976807, "num_tokens": 22712823.0, "step": 598 }, { "epoch": 0.07619895687571555, "ewc_loss": 0.006683349609375, "ewc_loss_diag": 2.3543834686279297e-06, "ewc_loss_parallel": 4.32133674621582e-06, "grad_norm": 9.848021507263184, "learning_rate": 2.5349724459516743e-07, "loss": 0.5718, "mean_token_accuracy": 0.8237340450286865, "num_tokens": 22750916.0, "step": 599 }, { "epoch": 0.07632616715430607, "ewc_loss": 0.0067138671875, "ewc_loss_diag": 2.3692846298217773e-06, "ewc_loss_parallel": 4.351139068603516e-06, "grad_norm": 10.183572769165039, "learning_rate": 2.539211530309453e-07, "loss": 0.5243, "mean_token_accuracy": 0.8345251083374023, "num_tokens": 22782537.0, "step": 600 }, { "epoch": 0.07645337743289658, "ewc_loss": 0.00677490234375, "ewc_loss_diag": 2.3990869522094727e-06, "ewc_loss_parallel": 4.351139068603516e-06, "grad_norm": 10.152090072631836, "learning_rate": 2.543450614667232e-07, "loss": 0.617, "mean_token_accuracy": 0.8111013770103455, "num_tokens": 22814805.0, "step": 601 }, { "epoch": 0.07658058771148708, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.4139881134033203e-06, "ewc_loss_parallel": 4.410743713378906e-06, "grad_norm": 10.245315551757812, "learning_rate": 2.547689699025011e-07, "loss": 0.6147, "mean_token_accuracy": 0.8102676868438721, "num_tokens": 22852654.0, "step": 602 }, { "epoch": 0.0767077979900776, "ewc_loss": 0.006805419921875, "ewc_loss_diag": 2.4139881134033203e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.250577926635742, "learning_rate": 2.551928783382789e-07, "loss": 0.5705, "mean_token_accuracy": 0.8226281404495239, "num_tokens": 22894205.0, "step": 603 }, { "epoch": 0.07683500826866811, "ewc_loss": 0.006805419921875, "ewc_loss_diag": 2.4139881134033203e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.214634895324707, "learning_rate": 2.5561678677405677e-07, "loss": 0.5366, "mean_token_accuracy": 0.8295962810516357, "num_tokens": 22924095.0, "step": 604 }, { "epoch": 0.07696221854725861, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.4139881134033203e-06, "ewc_loss_parallel": 4.410743713378906e-06, "grad_norm": 10.219124794006348, "learning_rate": 2.5604069520983467e-07, "loss": 0.619, "mean_token_accuracy": 0.8056999444961548, "num_tokens": 22960857.0, "step": 605 }, { "epoch": 0.07708942882584913, "ewc_loss": 0.006805419921875, "ewc_loss_diag": 2.4139881134033203e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.129027366638184, "learning_rate": 2.564646036456125e-07, "loss": 0.5694, "mean_token_accuracy": 0.8213874101638794, "num_tokens": 23004670.0, "step": 606 }, { "epoch": 0.07721663910443964, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.264719009399414, "learning_rate": 2.568885120813904e-07, "loss": 0.5698, "mean_token_accuracy": 0.8188278675079346, "num_tokens": 23041999.0, "step": 607 }, { "epoch": 0.07734384938303016, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.20521354675293, "learning_rate": 2.5731242051716826e-07, "loss": 0.5593, "mean_token_accuracy": 0.827670156955719, "num_tokens": 23080704.0, "step": 608 }, { "epoch": 0.07747105966162066, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 4.410743713378906e-06, "grad_norm": 10.118932723999023, "learning_rate": 2.5773632895294616e-07, "loss": 0.5937, "mean_token_accuracy": 0.8169207572937012, "num_tokens": 23118662.0, "step": 609 }, { "epoch": 0.07759826994021117, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.241191864013672, "learning_rate": 2.58160237388724e-07, "loss": 0.5176, "mean_token_accuracy": 0.8342201709747314, "num_tokens": 23150367.0, "step": 610 }, { "epoch": 0.07772548021880168, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.331302642822266, "learning_rate": 2.585841458245019e-07, "loss": 0.5748, "mean_token_accuracy": 0.8205608129501343, "num_tokens": 23185836.0, "step": 611 }, { "epoch": 0.07785269049739219, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.23009967803955, "learning_rate": 2.5900805426027975e-07, "loss": 0.5146, "mean_token_accuracy": 0.8383476734161377, "num_tokens": 23219346.0, "step": 612 }, { "epoch": 0.0779799007759827, "ewc_loss": 0.0068359375, "ewc_loss_diag": 2.4437904357910156e-06, "ewc_loss_parallel": 4.380941390991211e-06, "grad_norm": 10.189179420471191, "learning_rate": 2.5943196269605765e-07, "loss": 0.5877, "mean_token_accuracy": 0.8232532739639282, "num_tokens": 23257569.0, "step": 613 }, { "epoch": 0.07810711105457321, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.252129554748535, "learning_rate": 2.598558711318355e-07, "loss": 0.5768, "mean_token_accuracy": 0.823267936706543, "num_tokens": 23302944.0, "step": 614 }, { "epoch": 0.07823432133316371, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.554475784301758, "learning_rate": 2.602797795676134e-07, "loss": 0.5821, "mean_token_accuracy": 0.8202987909317017, "num_tokens": 23341660.0, "step": 615 }, { "epoch": 0.07836153161175423, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.270980834960938, "learning_rate": 2.6070368800339124e-07, "loss": 0.5819, "mean_token_accuracy": 0.8141201138496399, "num_tokens": 23375956.0, "step": 616 }, { "epoch": 0.07848874189034474, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 4.410743713378906e-06, "grad_norm": 10.317736625671387, "learning_rate": 2.6112759643916914e-07, "loss": 0.534, "mean_token_accuracy": 0.8319150805473328, "num_tokens": 23407568.0, "step": 617 }, { "epoch": 0.07861595216893524, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.599209785461426, "learning_rate": 2.61551504874947e-07, "loss": 0.568, "mean_token_accuracy": 0.8240118026733398, "num_tokens": 23440261.0, "step": 618 }, { "epoch": 0.07874316244752576, "ewc_loss": 0.006927490234375, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.290003776550293, "learning_rate": 2.619754133107249e-07, "loss": 0.5477, "mean_token_accuracy": 0.8299436569213867, "num_tokens": 23481706.0, "step": 619 }, { "epoch": 0.07887037272611627, "ewc_loss": 0.00689697265625, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.494327545166016, "learning_rate": 2.623993217465028e-07, "loss": 0.6088, "mean_token_accuracy": 0.8071194291114807, "num_tokens": 23515410.0, "step": 620 }, { "epoch": 0.07899758300470679, "ewc_loss": 0.006927490234375, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 4.470348358154297e-06, "grad_norm": 10.216169357299805, "learning_rate": 2.6282323018228063e-07, "loss": 0.5942, "mean_token_accuracy": 0.8200987577438354, "num_tokens": 23554627.0, "step": 621 }, { "epoch": 0.07912479328329729, "ewc_loss": 0.006927490234375, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.304264068603516, "learning_rate": 2.632471386180585e-07, "loss": 0.6158, "mean_token_accuracy": 0.8087612390518188, "num_tokens": 23593400.0, "step": 622 }, { "epoch": 0.0792520035618878, "ewc_loss": 0.0069580078125, "ewc_loss_diag": 2.4884939193725586e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.123854637145996, "learning_rate": 2.6367104705383637e-07, "loss": 0.6136, "mean_token_accuracy": 0.8081263899803162, "num_tokens": 23628242.0, "step": 623 }, { "epoch": 0.07937921384047832, "ewc_loss": 0.0069580078125, "ewc_loss_diag": 2.4884939193725586e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.290599822998047, "learning_rate": 2.6409495548961427e-07, "loss": 0.5994, "mean_token_accuracy": 0.8189444541931152, "num_tokens": 23671041.0, "step": 624 }, { "epoch": 0.07950642411906882, "ewc_loss": 0.0069580078125, "ewc_loss_diag": 2.4884939193725586e-06, "ewc_loss_parallel": 4.470348358154297e-06, "grad_norm": 10.687385559082031, "learning_rate": 2.6451886392539206e-07, "loss": 0.5785, "mean_token_accuracy": 0.823729395866394, "num_tokens": 23701904.0, "step": 625 }, { "epoch": 0.07963363439765933, "ewc_loss": 0.007049560546875, "ewc_loss_diag": 2.5033950805664062e-06, "ewc_loss_parallel": 4.559755325317383e-06, "grad_norm": 10.371176719665527, "learning_rate": 2.6494277236116996e-07, "loss": 0.5405, "mean_token_accuracy": 0.8297041654586792, "num_tokens": 23742374.0, "step": 626 }, { "epoch": 0.07976084467624985, "ewc_loss": 0.0069580078125, "ewc_loss_diag": 2.5033950805664062e-06, "ewc_loss_parallel": 4.4405460357666016e-06, "grad_norm": 10.395936965942383, "learning_rate": 2.6536668079694786e-07, "loss": 0.6469, "mean_token_accuracy": 0.8011026382446289, "num_tokens": 23780216.0, "step": 627 }, { "epoch": 0.07988805495484035, "ewc_loss": 0.00701904296875, "ewc_loss_diag": 2.5033950805664062e-06, "ewc_loss_parallel": 4.5299530029296875e-06, "grad_norm": 10.407193183898926, "learning_rate": 2.6579058923272576e-07, "loss": 0.4997, "mean_token_accuracy": 0.8455072641372681, "num_tokens": 23813641.0, "step": 628 }, { "epoch": 0.08001526523343086, "ewc_loss": 0.00701904296875, "ewc_loss_diag": 2.5033950805664062e-06, "ewc_loss_parallel": 4.5299530029296875e-06, "grad_norm": 10.388904571533203, "learning_rate": 2.6621449766850356e-07, "loss": 0.5859, "mean_token_accuracy": 0.8196057081222534, "num_tokens": 23850189.0, "step": 629 }, { "epoch": 0.08014247551202137, "ewc_loss": 0.00701904296875, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.5299530029296875e-06, "grad_norm": 10.18160343170166, "learning_rate": 2.6663840610428145e-07, "loss": 0.5391, "mean_token_accuracy": 0.8315832614898682, "num_tokens": 23884410.0, "step": 630 }, { "epoch": 0.08026968579061187, "ewc_loss": 0.00701904296875, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.500150680541992e-06, "grad_norm": 10.219578742980957, "learning_rate": 2.6706231454005935e-07, "loss": 0.5506, "mean_token_accuracy": 0.8284903168678284, "num_tokens": 23930139.0, "step": 631 }, { "epoch": 0.08039689606920239, "ewc_loss": 0.007080078125, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.559755325317383e-06, "grad_norm": 10.112178802490234, "learning_rate": 2.6748622297583725e-07, "loss": 0.5467, "mean_token_accuracy": 0.8306071162223816, "num_tokens": 23973124.0, "step": 632 }, { "epoch": 0.0805241063477929, "ewc_loss": 0.007080078125, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.559755325317383e-06, "grad_norm": 10.244736671447754, "learning_rate": 2.6791013141161505e-07, "loss": 0.5563, "mean_token_accuracy": 0.8286371231079102, "num_tokens": 24008676.0, "step": 633 }, { "epoch": 0.08065131662638342, "ewc_loss": 0.007080078125, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.559755325317383e-06, "grad_norm": 10.2915678024292, "learning_rate": 2.6833403984739294e-07, "loss": 0.5536, "mean_token_accuracy": 0.8259528875350952, "num_tokens": 24044880.0, "step": 634 }, { "epoch": 0.08077852690497392, "ewc_loss": 0.007080078125, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.559755325317383e-06, "grad_norm": 10.113941192626953, "learning_rate": 2.6875794828317084e-07, "loss": 0.5443, "mean_token_accuracy": 0.8340169787406921, "num_tokens": 24090140.0, "step": 635 }, { "epoch": 0.08090573718356443, "ewc_loss": 0.00701904296875, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.5299530029296875e-06, "grad_norm": 10.400830268859863, "learning_rate": 2.6918185671894874e-07, "loss": 0.6308, "mean_token_accuracy": 0.8097723722457886, "num_tokens": 24123311.0, "step": 636 }, { "epoch": 0.08103294746215495, "ewc_loss": 0.00714111328125, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 4.649162292480469e-06, "grad_norm": 10.369571685791016, "learning_rate": 2.6960576515472654e-07, "loss": 0.6639, "mean_token_accuracy": 0.7955778241157532, "num_tokens": 24164019.0, "step": 637 }, { "epoch": 0.08116015774074545, "ewc_loss": 0.00714111328125, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 4.589557647705078e-06, "grad_norm": 10.267325401306152, "learning_rate": 2.7002967359050443e-07, "loss": 0.5312, "mean_token_accuracy": 0.8349112272262573, "num_tokens": 24205986.0, "step": 638 }, { "epoch": 0.08128736801933596, "ewc_loss": 0.00714111328125, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 4.589557647705078e-06, "grad_norm": 10.391899108886719, "learning_rate": 2.7045358202628233e-07, "loss": 0.5886, "mean_token_accuracy": 0.8147741556167603, "num_tokens": 24251956.0, "step": 639 }, { "epoch": 0.08141457829792648, "ewc_loss": 0.00714111328125, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 4.6193599700927734e-06, "grad_norm": 10.320990562438965, "learning_rate": 2.7087749046206023e-07, "loss": 0.5336, "mean_token_accuracy": 0.8311795592308044, "num_tokens": 24284990.0, "step": 640 }, { "epoch": 0.08154178857651698, "ewc_loss": 0.00714111328125, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 4.6193599700927734e-06, "grad_norm": 10.392088890075684, "learning_rate": 2.71301398897838e-07, "loss": 0.5891, "mean_token_accuracy": 0.8173242807388306, "num_tokens": 24323305.0, "step": 641 }, { "epoch": 0.08166899885510749, "ewc_loss": 0.00714111328125, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 4.589557647705078e-06, "grad_norm": 10.376479148864746, "learning_rate": 2.717253073336159e-07, "loss": 0.4996, "mean_token_accuracy": 0.8425842523574829, "num_tokens": 24365847.0, "step": 642 }, { "epoch": 0.081796209133698, "ewc_loss": 0.0072021484375, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 4.649162292480469e-06, "grad_norm": 10.373919486999512, "learning_rate": 2.721492157693938e-07, "loss": 0.5073, "mean_token_accuracy": 0.8396520614624023, "num_tokens": 24407269.0, "step": 643 }, { "epoch": 0.0819234194122885, "ewc_loss": 0.0072021484375, "ewc_loss_diag": 2.5779008865356445e-06, "ewc_loss_parallel": 4.6193599700927734e-06, "grad_norm": 10.481422424316406, "learning_rate": 2.7257312420517167e-07, "loss": 0.5654, "mean_token_accuracy": 0.8223553895950317, "num_tokens": 24444990.0, "step": 644 }, { "epoch": 0.08205062969087902, "ewc_loss": 0.0072021484375, "ewc_loss_diag": 2.5779008865356445e-06, "ewc_loss_parallel": 4.6193599700927734e-06, "grad_norm": 10.326227188110352, "learning_rate": 2.729970326409495e-07, "loss": 0.5962, "mean_token_accuracy": 0.8180948495864868, "num_tokens": 24487293.0, "step": 645 }, { "epoch": 0.08217783996946953, "ewc_loss": 0.007293701171875, "ewc_loss_diag": 2.592802047729492e-06, "ewc_loss_parallel": 4.708766937255859e-06, "grad_norm": 10.503185272216797, "learning_rate": 2.734209410767274e-07, "loss": 0.5577, "mean_token_accuracy": 0.8277113437652588, "num_tokens": 24521403.0, "step": 646 }, { "epoch": 0.08230505024806005, "ewc_loss": 0.00726318359375, "ewc_loss_diag": 2.5779008865356445e-06, "ewc_loss_parallel": 4.678964614868164e-06, "grad_norm": 10.348230361938477, "learning_rate": 2.738448495125053e-07, "loss": 0.5651, "mean_token_accuracy": 0.827098548412323, "num_tokens": 24562740.0, "step": 647 }, { "epoch": 0.08243226052665055, "ewc_loss": 0.0072021484375, "ewc_loss_diag": 2.5779008865356445e-06, "ewc_loss_parallel": 4.649162292480469e-06, "grad_norm": 10.35462760925293, "learning_rate": 2.7426875794828316e-07, "loss": 0.5734, "mean_token_accuracy": 0.8171032667160034, "num_tokens": 24597507.0, "step": 648 }, { "epoch": 0.08255947080524106, "ewc_loss": 0.007293701171875, "ewc_loss_diag": 2.592802047729492e-06, "ewc_loss_parallel": 4.708766937255859e-06, "grad_norm": 10.476271629333496, "learning_rate": 2.74692666384061e-07, "loss": 0.5349, "mean_token_accuracy": 0.8318682909011841, "num_tokens": 24635629.0, "step": 649 }, { "epoch": 0.08268668108383158, "ewc_loss": 0.00732421875, "ewc_loss_diag": 2.60770320892334e-06, "ewc_loss_parallel": 4.708766937255859e-06, "grad_norm": 10.46044635772705, "learning_rate": 2.751165748198389e-07, "loss": 0.5602, "mean_token_accuracy": 0.8235682249069214, "num_tokens": 24676087.0, "step": 650 }, { "epoch": 0.08281389136242208, "ewc_loss": 0.00726318359375, "ewc_loss_diag": 2.60770320892334e-06, "ewc_loss_parallel": 4.678964614868164e-06, "grad_norm": 10.52815055847168, "learning_rate": 2.755404832556168e-07, "loss": 0.58, "mean_token_accuracy": 0.8200391530990601, "num_tokens": 24713652.0, "step": 651 }, { "epoch": 0.0829411016410126, "ewc_loss": 0.007354736328125, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 4.738569259643555e-06, "grad_norm": 10.345189094543457, "learning_rate": 2.7596439169139465e-07, "loss": 0.562, "mean_token_accuracy": 0.8269569873809814, "num_tokens": 24750130.0, "step": 652 }, { "epoch": 0.08306831191960311, "ewc_loss": 0.00732421875, "ewc_loss_diag": 2.60770320892334e-06, "ewc_loss_parallel": 4.738569259643555e-06, "grad_norm": 10.53175163269043, "learning_rate": 2.763883001271725e-07, "loss": 0.6124, "mean_token_accuracy": 0.8095457553863525, "num_tokens": 24787337.0, "step": 653 }, { "epoch": 0.08319552219819361, "ewc_loss": 0.00738525390625, "ewc_loss_diag": 2.60770320892334e-06, "ewc_loss_parallel": 4.76837158203125e-06, "grad_norm": 10.650463104248047, "learning_rate": 2.768122085629504e-07, "loss": 0.6041, "mean_token_accuracy": 0.8061116933822632, "num_tokens": 24822736.0, "step": 654 }, { "epoch": 0.08332273247678412, "ewc_loss": 0.00732421875, "ewc_loss_diag": 2.60770320892334e-06, "ewc_loss_parallel": 4.738569259643555e-06, "grad_norm": 10.59066104888916, "learning_rate": 2.772361169987283e-07, "loss": 0.5438, "mean_token_accuracy": 0.833804726600647, "num_tokens": 24858743.0, "step": 655 }, { "epoch": 0.08344994275537464, "ewc_loss": 0.007354736328125, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 4.738569259643555e-06, "grad_norm": 10.462995529174805, "learning_rate": 2.7766002543450614e-07, "loss": 0.5529, "mean_token_accuracy": 0.8252191543579102, "num_tokens": 24901286.0, "step": 656 }, { "epoch": 0.08357715303396514, "ewc_loss": 0.007354736328125, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 4.738569259643555e-06, "grad_norm": 10.393527030944824, "learning_rate": 2.78083933870284e-07, "loss": 0.6247, "mean_token_accuracy": 0.8080131411552429, "num_tokens": 24943457.0, "step": 657 }, { "epoch": 0.08370436331255565, "ewc_loss": 0.007415771484375, "ewc_loss_diag": 2.652406692504883e-06, "ewc_loss_parallel": 4.76837158203125e-06, "grad_norm": 10.379586219787598, "learning_rate": 2.785078423060619e-07, "loss": 0.5632, "mean_token_accuracy": 0.8229126334190369, "num_tokens": 24979247.0, "step": 658 }, { "epoch": 0.08383157359114617, "ewc_loss": 0.0074462890625, "ewc_loss_diag": 2.652406692504883e-06, "ewc_loss_parallel": 4.798173904418945e-06, "grad_norm": 10.47558307647705, "learning_rate": 2.789317507418398e-07, "loss": 0.5744, "mean_token_accuracy": 0.8195297718048096, "num_tokens": 25017456.0, "step": 659 }, { "epoch": 0.08395878386973668, "ewc_loss": 0.0074462890625, "ewc_loss_diag": 2.652406692504883e-06, "ewc_loss_parallel": 4.798173904418945e-06, "grad_norm": 10.592958450317383, "learning_rate": 2.7935565917761763e-07, "loss": 0.5309, "mean_token_accuracy": 0.8311802744865417, "num_tokens": 25054682.0, "step": 660 }, { "epoch": 0.08408599414832718, "ewc_loss": 0.0074462890625, "ewc_loss_diag": 2.6673078536987305e-06, "ewc_loss_parallel": 4.76837158203125e-06, "grad_norm": 10.539708137512207, "learning_rate": 2.797795676133955e-07, "loss": 0.6056, "mean_token_accuracy": 0.8103616237640381, "num_tokens": 25092501.0, "step": 661 }, { "epoch": 0.0842132044269177, "ewc_loss": 0.0074462890625, "ewc_loss_diag": 2.6673078536987305e-06, "ewc_loss_parallel": 4.798173904418945e-06, "grad_norm": 10.444877624511719, "learning_rate": 2.802034760491734e-07, "loss": 0.4709, "mean_token_accuracy": 0.851165235042572, "num_tokens": 25129253.0, "step": 662 }, { "epoch": 0.08434041470550821, "ewc_loss": 0.00750732421875, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 4.798173904418945e-06, "grad_norm": 10.458294868469238, "learning_rate": 2.806273844849512e-07, "loss": 0.5107, "mean_token_accuracy": 0.8405449390411377, "num_tokens": 25169119.0, "step": 663 }, { "epoch": 0.08446762498409871, "ewc_loss": 0.00750732421875, "ewc_loss_diag": 2.682209014892578e-06, "ewc_loss_parallel": 4.827976226806641e-06, "grad_norm": 10.514739036560059, "learning_rate": 2.810512929207291e-07, "loss": 0.5828, "mean_token_accuracy": 0.8134191036224365, "num_tokens": 25207654.0, "step": 664 }, { "epoch": 0.08459483526268922, "ewc_loss": 0.007568359375, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 4.857778549194336e-06, "grad_norm": 10.500117301940918, "learning_rate": 2.8147520135650697e-07, "loss": 0.527, "mean_token_accuracy": 0.831363320350647, "num_tokens": 25243258.0, "step": 665 }, { "epoch": 0.08472204554127974, "ewc_loss": 0.00750732421875, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 4.827976226806641e-06, "grad_norm": 10.400716781616211, "learning_rate": 2.8189910979228487e-07, "loss": 0.5674, "mean_token_accuracy": 0.8254352807998657, "num_tokens": 25281301.0, "step": 666 }, { "epoch": 0.08484925581987024, "ewc_loss": 0.007568359375, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 4.857778549194336e-06, "grad_norm": 10.613566398620605, "learning_rate": 2.823230182280627e-07, "loss": 0.5172, "mean_token_accuracy": 0.8381142616271973, "num_tokens": 25313500.0, "step": 667 }, { "epoch": 0.08497646609846075, "ewc_loss": 0.007568359375, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 4.857778549194336e-06, "grad_norm": 10.429245948791504, "learning_rate": 2.827469266638406e-07, "loss": 0.556, "mean_token_accuracy": 0.8256002068519592, "num_tokens": 25355121.0, "step": 668 }, { "epoch": 0.08510367637705127, "ewc_loss": 0.007568359375, "ewc_loss_diag": 2.726912498474121e-06, "ewc_loss_parallel": 4.857778549194336e-06, "grad_norm": 10.434967994689941, "learning_rate": 2.8317083509961846e-07, "loss": 0.5712, "mean_token_accuracy": 0.823927640914917, "num_tokens": 25398868.0, "step": 669 }, { "epoch": 0.08523088665564178, "ewc_loss": 0.0076904296875, "ewc_loss_diag": 2.7567148208618164e-06, "ewc_loss_parallel": 4.9173831939697266e-06, "grad_norm": 10.577341079711914, "learning_rate": 2.8359474353539636e-07, "loss": 0.55, "mean_token_accuracy": 0.8272063136100769, "num_tokens": 25438670.0, "step": 670 }, { "epoch": 0.08535809693423228, "ewc_loss": 0.00762939453125, "ewc_loss_diag": 2.726912498474121e-06, "ewc_loss_parallel": 4.9173831939697266e-06, "grad_norm": 10.621338844299316, "learning_rate": 2.840186519711742e-07, "loss": 0.5974, "mean_token_accuracy": 0.8188406825065613, "num_tokens": 25479243.0, "step": 671 }, { "epoch": 0.0854853072128228, "ewc_loss": 0.00762939453125, "ewc_loss_diag": 2.726912498474121e-06, "ewc_loss_parallel": 4.9173831939697266e-06, "grad_norm": 10.515304565429688, "learning_rate": 2.844425604069521e-07, "loss": 0.5805, "mean_token_accuracy": 0.8177107572555542, "num_tokens": 25514615.0, "step": 672 }, { "epoch": 0.08561251749141331, "ewc_loss": 0.00762939453125, "ewc_loss_diag": 2.7567148208618164e-06, "ewc_loss_parallel": 4.887580871582031e-06, "grad_norm": 10.662107467651367, "learning_rate": 2.8486646884272995e-07, "loss": 0.559, "mean_token_accuracy": 0.8248976469039917, "num_tokens": 25552753.0, "step": 673 }, { "epoch": 0.08573972777000381, "ewc_loss": 0.0078125, "ewc_loss_diag": 2.7567148208618164e-06, "ewc_loss_parallel": 5.036592483520508e-06, "grad_norm": 10.529399871826172, "learning_rate": 2.8529037727850785e-07, "loss": 0.4943, "mean_token_accuracy": 0.8426545262336731, "num_tokens": 25587415.0, "step": 674 }, { "epoch": 0.08586693804859433, "ewc_loss": 0.0076904296875, "ewc_loss_diag": 2.7567148208618164e-06, "ewc_loss_parallel": 4.947185516357422e-06, "grad_norm": 10.513866424560547, "learning_rate": 2.857142857142857e-07, "loss": 0.5258, "mean_token_accuracy": 0.8369107842445374, "num_tokens": 25625962.0, "step": 675 }, { "epoch": 0.08599414832718484, "ewc_loss": 0.0076904296875, "ewc_loss_diag": 2.7567148208618164e-06, "ewc_loss_parallel": 4.947185516357422e-06, "grad_norm": 10.6256685256958, "learning_rate": 2.861381941500636e-07, "loss": 0.5739, "mean_token_accuracy": 0.8207888007164001, "num_tokens": 25669680.0, "step": 676 }, { "epoch": 0.08612135860577534, "ewc_loss": 0.0079345703125, "ewc_loss_diag": 2.8014183044433594e-06, "ewc_loss_parallel": 5.0961971282958984e-06, "grad_norm": 10.595988273620605, "learning_rate": 2.8656210258584144e-07, "loss": 0.6106, "mean_token_accuracy": 0.8080242872238159, "num_tokens": 25709221.0, "step": 677 }, { "epoch": 0.08624856888436586, "ewc_loss": 0.0078125, "ewc_loss_diag": 2.8014183044433594e-06, "ewc_loss_parallel": 5.036592483520508e-06, "grad_norm": 10.888525009155273, "learning_rate": 2.869860110216193e-07, "loss": 0.6102, "mean_token_accuracy": 0.806801438331604, "num_tokens": 25741880.0, "step": 678 }, { "epoch": 0.08637577916295637, "ewc_loss": 0.0078125, "ewc_loss_diag": 2.8014183044433594e-06, "ewc_loss_parallel": 5.036592483520508e-06, "grad_norm": 10.623059272766113, "learning_rate": 2.874099194573972e-07, "loss": 0.5417, "mean_token_accuracy": 0.8295755386352539, "num_tokens": 25786282.0, "step": 679 }, { "epoch": 0.08650298944154687, "ewc_loss": 0.007781982421875, "ewc_loss_diag": 2.8014183044433594e-06, "ewc_loss_parallel": 4.976987838745117e-06, "grad_norm": 10.729609489440918, "learning_rate": 2.878338278931751e-07, "loss": 0.5896, "mean_token_accuracy": 0.8189204931259155, "num_tokens": 25820229.0, "step": 680 }, { "epoch": 0.08663019972013739, "ewc_loss": 0.00787353515625, "ewc_loss_diag": 2.816319465637207e-06, "ewc_loss_parallel": 5.066394805908203e-06, "grad_norm": 11.009591102600098, "learning_rate": 2.8825773632895293e-07, "loss": 0.5232, "mean_token_accuracy": 0.8399022221565247, "num_tokens": 25856465.0, "step": 681 }, { "epoch": 0.0867574099987279, "ewc_loss": 0.00787353515625, "ewc_loss_diag": 2.816319465637207e-06, "ewc_loss_parallel": 5.066394805908203e-06, "grad_norm": 10.62303638458252, "learning_rate": 2.886816447647308e-07, "loss": 0.5228, "mean_token_accuracy": 0.8361087441444397, "num_tokens": 25897407.0, "step": 682 }, { "epoch": 0.08688462027731841, "ewc_loss": 0.00787353515625, "ewc_loss_diag": 2.816319465637207e-06, "ewc_loss_parallel": 5.066394805908203e-06, "grad_norm": 10.565032958984375, "learning_rate": 2.891055532005087e-07, "loss": 0.5453, "mean_token_accuracy": 0.8307037949562073, "num_tokens": 25938825.0, "step": 683 }, { "epoch": 0.08701183055590891, "ewc_loss": 0.00787353515625, "ewc_loss_diag": 2.816319465637207e-06, "ewc_loss_parallel": 5.036592483520508e-06, "grad_norm": 10.934188842773438, "learning_rate": 2.8952946163628657e-07, "loss": 0.5776, "mean_token_accuracy": 0.8167688250541687, "num_tokens": 25974944.0, "step": 684 }, { "epoch": 0.08713904083449943, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.8312206268310547e-06, "ewc_loss_parallel": 5.185604095458984e-06, "grad_norm": 10.812607765197754, "learning_rate": 2.899533700720644e-07, "loss": 0.5352, "mean_token_accuracy": 0.8304022550582886, "num_tokens": 26012864.0, "step": 685 }, { "epoch": 0.08726625111308994, "ewc_loss": 0.0079345703125, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 5.036592483520508e-06, "grad_norm": 10.68075942993164, "learning_rate": 2.9037727850784227e-07, "loss": 0.5821, "mean_token_accuracy": 0.8195963501930237, "num_tokens": 26049779.0, "step": 686 }, { "epoch": 0.08739346139168044, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 5.155801773071289e-06, "grad_norm": 11.074577331542969, "learning_rate": 2.9080118694362016e-07, "loss": 0.5518, "mean_token_accuracy": 0.8288516998291016, "num_tokens": 26089095.0, "step": 687 }, { "epoch": 0.08752067167027096, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 5.155801773071289e-06, "grad_norm": 10.656603813171387, "learning_rate": 2.9122509537939806e-07, "loss": 0.5512, "mean_token_accuracy": 0.8282783031463623, "num_tokens": 26132553.0, "step": 688 }, { "epoch": 0.08764788194886147, "ewc_loss": 0.0079345703125, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 5.066394805908203e-06, "grad_norm": 10.759723663330078, "learning_rate": 2.916490038151759e-07, "loss": 0.6012, "mean_token_accuracy": 0.8103616237640381, "num_tokens": 26171301.0, "step": 689 }, { "epoch": 0.08777509222745197, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 5.21540641784668e-06, "grad_norm": 10.942805290222168, "learning_rate": 2.9207291225095376e-07, "loss": 0.5894, "mean_token_accuracy": 0.8158348798751831, "num_tokens": 26217871.0, "step": 690 }, { "epoch": 0.08790230250604249, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.8759241104125977e-06, "ewc_loss_parallel": 5.21540641784668e-06, "grad_norm": 10.727195739746094, "learning_rate": 2.9249682068673166e-07, "loss": 0.5528, "mean_token_accuracy": 0.8256745338439941, "num_tokens": 26255927.0, "step": 691 }, { "epoch": 0.088029512784633, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 5.155801773071289e-06, "grad_norm": 10.942028045654297, "learning_rate": 2.9292072912250955e-07, "loss": 0.5364, "mean_token_accuracy": 0.834116518497467, "num_tokens": 26296041.0, "step": 692 }, { "epoch": 0.0881567230632235, "ewc_loss": 0.00811767578125, "ewc_loss_diag": 2.8759241104125977e-06, "ewc_loss_parallel": 5.245208740234375e-06, "grad_norm": 10.915189743041992, "learning_rate": 2.933446375582874e-07, "loss": 0.6187, "mean_token_accuracy": 0.8087100982666016, "num_tokens": 26333785.0, "step": 693 }, { "epoch": 0.08828393334181402, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.185604095458984e-06, "grad_norm": 10.770364761352539, "learning_rate": 2.9376854599406525e-07, "loss": 0.5243, "mean_token_accuracy": 0.8331782817840576, "num_tokens": 26368837.0, "step": 694 }, { "epoch": 0.08841114362040453, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.8759241104125977e-06, "ewc_loss_parallel": 5.21540641784668e-06, "grad_norm": 10.884119987487793, "learning_rate": 2.9419245442984315e-07, "loss": 0.6003, "mean_token_accuracy": 0.8152459859848022, "num_tokens": 26402151.0, "step": 695 }, { "epoch": 0.08853835389899505, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.27501106262207e-06, "grad_norm": 10.896001815795898, "learning_rate": 2.9461636286562104e-07, "loss": 0.5517, "mean_token_accuracy": 0.8263434171676636, "num_tokens": 26435550.0, "step": 696 }, { "epoch": 0.08866556417758555, "ewc_loss": 0.00811767578125, "ewc_loss_diag": 2.9206275939941406e-06, "ewc_loss_parallel": 5.21540641784668e-06, "grad_norm": 11.473077774047852, "learning_rate": 2.9504027130139884e-07, "loss": 0.5406, "mean_token_accuracy": 0.8310868144035339, "num_tokens": 26479299.0, "step": 697 }, { "epoch": 0.08879277445617606, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.27501106262207e-06, "grad_norm": 10.875436782836914, "learning_rate": 2.9546417973717674e-07, "loss": 0.578, "mean_token_accuracy": 0.8192912340164185, "num_tokens": 26519380.0, "step": 698 }, { "epoch": 0.08891998473476657, "ewc_loss": 0.008056640625, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.185604095458984e-06, "grad_norm": 10.686784744262695, "learning_rate": 2.9588808817295464e-07, "loss": 0.5044, "mean_token_accuracy": 0.839445173740387, "num_tokens": 26557306.0, "step": 699 }, { "epoch": 0.08904719501335707, "ewc_loss": 0.00811767578125, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.245208740234375e-06, "grad_norm": 11.075878143310547, "learning_rate": 2.9631199660873253e-07, "loss": 0.5756, "mean_token_accuracy": 0.8211311101913452, "num_tokens": 26593549.0, "step": 700 }, { "epoch": 0.08917440529194759, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.27501106262207e-06, "grad_norm": 10.766351699829102, "learning_rate": 2.9673590504451033e-07, "loss": 0.4994, "mean_token_accuracy": 0.8445167541503906, "num_tokens": 26629926.0, "step": 701 }, { "epoch": 0.0893016155705381, "ewc_loss": 0.00811767578125, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 5.245208740234375e-06, "grad_norm": 10.827312469482422, "learning_rate": 2.9715981348028823e-07, "loss": 0.5319, "mean_token_accuracy": 0.8345584869384766, "num_tokens": 26668652.0, "step": 702 }, { "epoch": 0.0894288258491286, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.905726432800293e-06, "ewc_loss_parallel": 5.245208740234375e-06, "grad_norm": 11.012038230895996, "learning_rate": 2.975837219160661e-07, "loss": 0.5942, "mean_token_accuracy": 0.8148205280303955, "num_tokens": 26708084.0, "step": 703 }, { "epoch": 0.08955603612771912, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.905726432800293e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 10.898122787475586, "learning_rate": 2.98007630351844e-07, "loss": 0.5595, "mean_token_accuracy": 0.8259550333023071, "num_tokens": 26750134.0, "step": 704 }, { "epoch": 0.08968324640630963, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.905726432800293e-06, "ewc_loss_parallel": 5.27501106262207e-06, "grad_norm": 11.072025299072266, "learning_rate": 2.984315387876218e-07, "loss": 0.5672, "mean_token_accuracy": 0.8221117258071899, "num_tokens": 26785410.0, "step": 705 }, { "epoch": 0.08981045668490013, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.905726432800293e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 10.88932991027832, "learning_rate": 2.988554472233997e-07, "loss": 0.5711, "mean_token_accuracy": 0.8246129751205444, "num_tokens": 26825226.0, "step": 706 }, { "epoch": 0.08993766696349065, "ewc_loss": 0.0081787109375, "ewc_loss_diag": 2.905726432800293e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 11.037252426147461, "learning_rate": 2.992793556591776e-07, "loss": 0.5649, "mean_token_accuracy": 0.8230141401290894, "num_tokens": 26865401.0, "step": 707 }, { "epoch": 0.09006487724208116, "ewc_loss": 0.00823974609375, "ewc_loss_diag": 2.905726432800293e-06, "ewc_loss_parallel": 5.334615707397461e-06, "grad_norm": 10.96115493774414, "learning_rate": 2.997032640949555e-07, "loss": 0.575, "mean_token_accuracy": 0.8216816186904907, "num_tokens": 26905365.0, "step": 708 }, { "epoch": 0.09019208752067168, "ewc_loss": 0.00823974609375, "ewc_loss_diag": 2.9206275939941406e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 10.848289489746094, "learning_rate": 3.001271725307333e-07, "loss": 0.5155, "mean_token_accuracy": 0.8380424976348877, "num_tokens": 26941348.0, "step": 709 }, { "epoch": 0.09031929779926218, "ewc_loss": 0.00823974609375, "ewc_loss_diag": 2.9206275939941406e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 10.872892379760742, "learning_rate": 3.005510809665112e-07, "loss": 0.5506, "mean_token_accuracy": 0.8283737897872925, "num_tokens": 26975803.0, "step": 710 }, { "epoch": 0.09044650807785269, "ewc_loss": 0.00823974609375, "ewc_loss_diag": 2.9206275939941406e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 10.907176971435547, "learning_rate": 3.009749894022891e-07, "loss": 0.4964, "mean_token_accuracy": 0.8431406021118164, "num_tokens": 27014931.0, "step": 711 }, { "epoch": 0.0905737183564432, "ewc_loss": 0.00823974609375, "ewc_loss_diag": 2.9206275939941406e-06, "ewc_loss_parallel": 5.334615707397461e-06, "grad_norm": 11.061509132385254, "learning_rate": 3.01398897838067e-07, "loss": 0.5548, "mean_token_accuracy": 0.8285331726074219, "num_tokens": 27055505.0, "step": 712 }, { "epoch": 0.0907009286350337, "ewc_loss": 0.00823974609375, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 5.304813385009766e-06, "grad_norm": 10.922492027282715, "learning_rate": 3.018228062738448e-07, "loss": 0.5813, "mean_token_accuracy": 0.8221690058708191, "num_tokens": 27092121.0, "step": 713 }, { "epoch": 0.09082813891362422, "ewc_loss": 0.00830078125, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 5.334615707397461e-06, "grad_norm": 10.889300346374512, "learning_rate": 3.022467147096227e-07, "loss": 0.5181, "mean_token_accuracy": 0.8372598886489868, "num_tokens": 27129827.0, "step": 714 }, { "epoch": 0.09095534919221474, "ewc_loss": 0.00830078125, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 5.334615707397461e-06, "grad_norm": 11.000929832458496, "learning_rate": 3.026706231454006e-07, "loss": 0.5354, "mean_token_accuracy": 0.8316991329193115, "num_tokens": 27174977.0, "step": 715 }, { "epoch": 0.09108255947080524, "ewc_loss": 0.00830078125, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 5.3942203521728516e-06, "grad_norm": 10.996576309204102, "learning_rate": 3.0309453158117844e-07, "loss": 0.5415, "mean_token_accuracy": 0.8289521932601929, "num_tokens": 27211472.0, "step": 716 }, { "epoch": 0.09120976974939575, "ewc_loss": 0.00830078125, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 5.364418029785156e-06, "grad_norm": 11.014885902404785, "learning_rate": 3.035184400169563e-07, "loss": 0.6254, "mean_token_accuracy": 0.8093525171279907, "num_tokens": 27249160.0, "step": 717 }, { "epoch": 0.09133698002798626, "ewc_loss": 0.00830078125, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 5.364418029785156e-06, "grad_norm": 11.046886444091797, "learning_rate": 3.039423484527342e-07, "loss": 0.5327, "mean_token_accuracy": 0.8322254419326782, "num_tokens": 27287946.0, "step": 718 }, { "epoch": 0.09146419030657676, "ewc_loss": 0.00836181640625, "ewc_loss_diag": 2.950429916381836e-06, "ewc_loss_parallel": 5.3942203521728516e-06, "grad_norm": 10.797239303588867, "learning_rate": 3.043662568885121e-07, "loss": 0.5676, "mean_token_accuracy": 0.8175647258758545, "num_tokens": 27323388.0, "step": 719 }, { "epoch": 0.09159140058516728, "ewc_loss": 0.00836181640625, "ewc_loss_diag": 2.950429916381836e-06, "ewc_loss_parallel": 5.3942203521728516e-06, "grad_norm": 11.172287940979004, "learning_rate": 3.0479016532428993e-07, "loss": 0.6143, "mean_token_accuracy": 0.8075889348983765, "num_tokens": 27357524.0, "step": 720 }, { "epoch": 0.0917186108637578, "ewc_loss": 0.0084228515625, "ewc_loss_diag": 2.950429916381836e-06, "ewc_loss_parallel": 5.4836273193359375e-06, "grad_norm": 11.112669944763184, "learning_rate": 3.052140737600678e-07, "loss": 0.5205, "mean_token_accuracy": 0.8359286189079285, "num_tokens": 27392037.0, "step": 721 }, { "epoch": 0.09184582114234831, "ewc_loss": 0.00848388671875, "ewc_loss_diag": 2.9802322387695312e-06, "ewc_loss_parallel": 5.4836273193359375e-06, "grad_norm": 10.862343788146973, "learning_rate": 3.056379821958457e-07, "loss": 0.5321, "mean_token_accuracy": 0.8325783014297485, "num_tokens": 27429149.0, "step": 722 }, { "epoch": 0.09197303142093881, "ewc_loss": 0.0084228515625, "ewc_loss_diag": 2.995133399963379e-06, "ewc_loss_parallel": 5.424022674560547e-06, "grad_norm": 11.185674667358398, "learning_rate": 3.060618906316236e-07, "loss": 0.5383, "mean_token_accuracy": 0.8333992958068848, "num_tokens": 27465023.0, "step": 723 }, { "epoch": 0.09210024169952932, "ewc_loss": 0.008544921875, "ewc_loss_diag": 3.0100345611572266e-06, "ewc_loss_parallel": 5.513429641723633e-06, "grad_norm": 11.143731117248535, "learning_rate": 3.064857990674014e-07, "loss": 0.5293, "mean_token_accuracy": 0.8350642919540405, "num_tokens": 27501268.0, "step": 724 }, { "epoch": 0.09222745197811984, "ewc_loss": 0.008544921875, "ewc_loss_diag": 3.0100345611572266e-06, "ewc_loss_parallel": 5.513429641723633e-06, "grad_norm": 10.906365394592285, "learning_rate": 3.0690970750317927e-07, "loss": 0.5816, "mean_token_accuracy": 0.8193930387496948, "num_tokens": 27539937.0, "step": 725 }, { "epoch": 0.09235466225671034, "ewc_loss": 0.008544921875, "ewc_loss_diag": 3.039836883544922e-06, "ewc_loss_parallel": 5.4836273193359375e-06, "grad_norm": 11.163132667541504, "learning_rate": 3.0733361593895717e-07, "loss": 0.5432, "mean_token_accuracy": 0.8301502466201782, "num_tokens": 27574576.0, "step": 726 }, { "epoch": 0.09248187253530085, "ewc_loss": 0.00860595703125, "ewc_loss_diag": 3.039836883544922e-06, "ewc_loss_parallel": 5.543231964111328e-06, "grad_norm": 11.078206062316895, "learning_rate": 3.0775752437473507e-07, "loss": 0.5526, "mean_token_accuracy": 0.8236232995986938, "num_tokens": 27616036.0, "step": 727 }, { "epoch": 0.09260908281389137, "ewc_loss": 0.008544921875, "ewc_loss_diag": 3.039836883544922e-06, "ewc_loss_parallel": 5.4836273193359375e-06, "grad_norm": 10.94552230834961, "learning_rate": 3.081814328105129e-07, "loss": 0.5763, "mean_token_accuracy": 0.8203385472297668, "num_tokens": 27662040.0, "step": 728 }, { "epoch": 0.09273629309248187, "ewc_loss": 0.008544921875, "ewc_loss_diag": 3.039836883544922e-06, "ewc_loss_parallel": 5.513429641723633e-06, "grad_norm": 11.06830883026123, "learning_rate": 3.0860534124629076e-07, "loss": 0.6058, "mean_token_accuracy": 0.8117598295211792, "num_tokens": 27705136.0, "step": 729 }, { "epoch": 0.09286350337107238, "ewc_loss": 0.00860595703125, "ewc_loss_diag": 3.039836883544922e-06, "ewc_loss_parallel": 5.543231964111328e-06, "grad_norm": 11.172624588012695, "learning_rate": 3.0902924968206866e-07, "loss": 0.5556, "mean_token_accuracy": 0.825718104839325, "num_tokens": 27740564.0, "step": 730 }, { "epoch": 0.0929907136496629, "ewc_loss": 0.0086669921875, "ewc_loss_diag": 3.0547380447387695e-06, "ewc_loss_parallel": 5.632638931274414e-06, "grad_norm": 11.165634155273438, "learning_rate": 3.0945315811784656e-07, "loss": 0.527, "mean_token_accuracy": 0.8325815200805664, "num_tokens": 27774788.0, "step": 731 }, { "epoch": 0.0931179239282534, "ewc_loss": 0.0086669921875, "ewc_loss_diag": 3.0547380447387695e-06, "ewc_loss_parallel": 5.5730342864990234e-06, "grad_norm": 11.085196495056152, "learning_rate": 3.098770665536244e-07, "loss": 0.6731, "mean_token_accuracy": 0.7896196246147156, "num_tokens": 27818300.0, "step": 732 }, { "epoch": 0.09324513420684391, "ewc_loss": 0.0086669921875, "ewc_loss_diag": 3.0547380447387695e-06, "ewc_loss_parallel": 5.602836608886719e-06, "grad_norm": 11.198741912841797, "learning_rate": 3.1030097498940225e-07, "loss": 0.5143, "mean_token_accuracy": 0.8378680944442749, "num_tokens": 27857739.0, "step": 733 }, { "epoch": 0.09337234448543442, "ewc_loss": 0.00872802734375, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 5.632638931274414e-06, "grad_norm": 11.018009185791016, "learning_rate": 3.1072488342518015e-07, "loss": 0.5557, "mean_token_accuracy": 0.8215577006340027, "num_tokens": 27893897.0, "step": 734 }, { "epoch": 0.09349955476402494, "ewc_loss": 0.00872802734375, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 5.632638931274414e-06, "grad_norm": 10.971031188964844, "learning_rate": 3.11148791860958e-07, "loss": 0.5408, "mean_token_accuracy": 0.8276951909065247, "num_tokens": 27930511.0, "step": 735 }, { "epoch": 0.09362676504261544, "ewc_loss": 0.0086669921875, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 5.5730342864990234e-06, "grad_norm": 11.221235275268555, "learning_rate": 3.115727002967359e-07, "loss": 0.5595, "mean_token_accuracy": 0.8234269022941589, "num_tokens": 27961128.0, "step": 736 }, { "epoch": 0.09375397532120595, "ewc_loss": 0.0087890625, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 5.692243576049805e-06, "grad_norm": 11.329445838928223, "learning_rate": 3.1199660873251374e-07, "loss": 0.5602, "mean_token_accuracy": 0.8263631463050842, "num_tokens": 28000748.0, "step": 737 }, { "epoch": 0.09388118559979647, "ewc_loss": 0.00872802734375, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 5.662441253662109e-06, "grad_norm": 11.0958833694458, "learning_rate": 3.1242051716829164e-07, "loss": 0.5928, "mean_token_accuracy": 0.8134050965309143, "num_tokens": 28046455.0, "step": 738 }, { "epoch": 0.09400839587838697, "ewc_loss": 0.00872802734375, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 5.632638931274414e-06, "grad_norm": 11.162633895874023, "learning_rate": 3.128444256040695e-07, "loss": 0.5686, "mean_token_accuracy": 0.8238257169723511, "num_tokens": 28087026.0, "step": 739 }, { "epoch": 0.09413560615697748, "ewc_loss": 0.00872802734375, "ewc_loss_diag": 3.0994415283203125e-06, "ewc_loss_parallel": 5.632638931274414e-06, "grad_norm": 11.008894920349121, "learning_rate": 3.132683340398474e-07, "loss": 0.4909, "mean_token_accuracy": 0.8430465459823608, "num_tokens": 28126382.0, "step": 740 }, { "epoch": 0.094262816435568, "ewc_loss": 0.0087890625, "ewc_loss_diag": 3.0994415283203125e-06, "ewc_loss_parallel": 5.662441253662109e-06, "grad_norm": 11.10252857208252, "learning_rate": 3.1369224247562523e-07, "loss": 0.5188, "mean_token_accuracy": 0.8383597731590271, "num_tokens": 28161198.0, "step": 741 }, { "epoch": 0.0943900267141585, "ewc_loss": 0.0087890625, "ewc_loss_diag": 3.0994415283203125e-06, "ewc_loss_parallel": 5.692243576049805e-06, "grad_norm": 11.360219955444336, "learning_rate": 3.1411615091140313e-07, "loss": 0.4874, "mean_token_accuracy": 0.8468435406684875, "num_tokens": 28204612.0, "step": 742 }, { "epoch": 0.09451723699274901, "ewc_loss": 0.0087890625, "ewc_loss_diag": 3.0994415283203125e-06, "ewc_loss_parallel": 5.662441253662109e-06, "grad_norm": 11.074166297912598, "learning_rate": 3.14540059347181e-07, "loss": 0.5413, "mean_token_accuracy": 0.8294192552566528, "num_tokens": 28241050.0, "step": 743 }, { "epoch": 0.09464444727133953, "ewc_loss": 0.00872802734375, "ewc_loss_diag": 3.0994415283203125e-06, "ewc_loss_parallel": 5.632638931274414e-06, "grad_norm": 11.140063285827637, "learning_rate": 3.149639677829589e-07, "loss": 0.5004, "mean_token_accuracy": 0.8405883312225342, "num_tokens": 28278032.0, "step": 744 }, { "epoch": 0.09477165754993004, "ewc_loss": 0.00885009765625, "ewc_loss_diag": 3.0994415283203125e-06, "ewc_loss_parallel": 5.7220458984375e-06, "grad_norm": 11.493786811828613, "learning_rate": 3.153878762187368e-07, "loss": 0.5883, "mean_token_accuracy": 0.8209289908409119, "num_tokens": 28315599.0, "step": 745 }, { "epoch": 0.09489886782852054, "ewc_loss": 0.00885009765625, "ewc_loss_diag": 3.11434268951416e-06, "ewc_loss_parallel": 5.7220458984375e-06, "grad_norm": 11.25187873840332, "learning_rate": 3.158117846545146e-07, "loss": 0.6117, "mean_token_accuracy": 0.8121163845062256, "num_tokens": 28358504.0, "step": 746 }, { "epoch": 0.09502607810711106, "ewc_loss": 0.0087890625, "ewc_loss_diag": 3.129243850708008e-06, "ewc_loss_parallel": 5.662441253662109e-06, "grad_norm": 11.237212181091309, "learning_rate": 3.1623569309029247e-07, "loss": 0.5282, "mean_token_accuracy": 0.8333917260169983, "num_tokens": 28389626.0, "step": 747 }, { "epoch": 0.09515328838570157, "ewc_loss": 0.00885009765625, "ewc_loss_diag": 3.129243850708008e-06, "ewc_loss_parallel": 5.692243576049805e-06, "grad_norm": 11.278782844543457, "learning_rate": 3.1665960152607037e-07, "loss": 0.5118, "mean_token_accuracy": 0.8378285765647888, "num_tokens": 28430361.0, "step": 748 }, { "epoch": 0.09528049866429207, "ewc_loss": 0.00885009765625, "ewc_loss_diag": 3.129243850708008e-06, "ewc_loss_parallel": 5.692243576049805e-06, "grad_norm": 11.15079402923584, "learning_rate": 3.1708350996184826e-07, "loss": 0.5481, "mean_token_accuracy": 0.8280009031295776, "num_tokens": 28469792.0, "step": 749 }, { "epoch": 0.09540770894288259, "ewc_loss": 0.00885009765625, "ewc_loss_diag": 3.129243850708008e-06, "ewc_loss_parallel": 5.692243576049805e-06, "grad_norm": 11.332879066467285, "learning_rate": 3.175074183976261e-07, "loss": 0.5273, "mean_token_accuracy": 0.8370344638824463, "num_tokens": 28507569.0, "step": 750 }, { "epoch": 0.0955349192214731, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.129243850708008e-06, "ewc_loss_parallel": 5.781650543212891e-06, "grad_norm": 11.246880531311035, "learning_rate": 3.1793132683340396e-07, "loss": 0.5537, "mean_token_accuracy": 0.8268546462059021, "num_tokens": 28542336.0, "step": 751 }, { "epoch": 0.0956621295000636, "ewc_loss": 0.00885009765625, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.692243576049805e-06, "grad_norm": 11.206452369689941, "learning_rate": 3.1835523526918186e-07, "loss": 0.5031, "mean_token_accuracy": 0.8414168357849121, "num_tokens": 28578587.0, "step": 752 }, { "epoch": 0.09578933977865411, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 5.781650543212891e-06, "grad_norm": 11.435611724853516, "learning_rate": 3.1877914370495975e-07, "loss": 0.5832, "mean_token_accuracy": 0.8187742829322815, "num_tokens": 28617505.0, "step": 753 }, { "epoch": 0.09591655005724463, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.781650543212891e-06, "grad_norm": 11.17590618133545, "learning_rate": 3.1920305214073755e-07, "loss": 0.5897, "mean_token_accuracy": 0.8174092769622803, "num_tokens": 28658526.0, "step": 754 }, { "epoch": 0.09604376033583513, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.7220458984375e-06, "grad_norm": 11.229044914245605, "learning_rate": 3.1962696057651545e-07, "loss": 0.5244, "mean_token_accuracy": 0.8344041109085083, "num_tokens": 28698825.0, "step": 755 }, { "epoch": 0.09617097061442564, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.751848220825195e-06, "grad_norm": 11.336013793945312, "learning_rate": 3.2005086901229335e-07, "loss": 0.5671, "mean_token_accuracy": 0.820008397102356, "num_tokens": 28733238.0, "step": 756 }, { "epoch": 0.09629818089301616, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.319973945617676, "learning_rate": 3.2047477744807125e-07, "loss": 0.5699, "mean_token_accuracy": 0.824838399887085, "num_tokens": 28771552.0, "step": 757 }, { "epoch": 0.09642539117160667, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.159046173095703e-06, "ewc_loss_parallel": 5.781650543212891e-06, "grad_norm": 11.39318561553955, "learning_rate": 3.2089868588384904e-07, "loss": 0.6018, "mean_token_accuracy": 0.8106493949890137, "num_tokens": 28810827.0, "step": 758 }, { "epoch": 0.09655260145019717, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.159046173095703e-06, "ewc_loss_parallel": 5.781650543212891e-06, "grad_norm": 11.359456062316895, "learning_rate": 3.2132259431962694e-07, "loss": 0.5107, "mean_token_accuracy": 0.8405283689498901, "num_tokens": 28849017.0, "step": 759 }, { "epoch": 0.09667981172878769, "ewc_loss": 0.00897216796875, "ewc_loss_diag": 3.159046173095703e-06, "ewc_loss_parallel": 5.841255187988281e-06, "grad_norm": 11.391944885253906, "learning_rate": 3.2174650275540484e-07, "loss": 0.5618, "mean_token_accuracy": 0.8294236063957214, "num_tokens": 28887659.0, "step": 760 }, { "epoch": 0.0968070220073782, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.600153923034668, "learning_rate": 3.2217041119118274e-07, "loss": 0.5294, "mean_token_accuracy": 0.8328744173049927, "num_tokens": 28928983.0, "step": 761 }, { "epoch": 0.0969342322859687, "ewc_loss": 0.0089111328125, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 5.781650543212891e-06, "grad_norm": 11.339705467224121, "learning_rate": 3.2259431962696053e-07, "loss": 0.5631, "mean_token_accuracy": 0.8247895240783691, "num_tokens": 28962195.0, "step": 762 }, { "epoch": 0.09706144256455922, "ewc_loss": 0.00897216796875, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.293124198913574, "learning_rate": 3.2301822806273843e-07, "loss": 0.5455, "mean_token_accuracy": 0.8275019526481628, "num_tokens": 28997351.0, "step": 763 }, { "epoch": 0.09718865284314973, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 5.841255187988281e-06, "grad_norm": 11.492738723754883, "learning_rate": 3.2344213649851633e-07, "loss": 0.5724, "mean_token_accuracy": 0.8225823640823364, "num_tokens": 29040988.0, "step": 764 }, { "epoch": 0.09731586312174023, "ewc_loss": 0.00897216796875, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.476637840270996, "learning_rate": 3.238660449342942e-07, "loss": 0.5333, "mean_token_accuracy": 0.8301368951797485, "num_tokens": 29081873.0, "step": 765 }, { "epoch": 0.09744307340033075, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.841255187988281e-06, "grad_norm": 11.334654808044434, "learning_rate": 3.24289953370072e-07, "loss": 0.5021, "mean_token_accuracy": 0.8381909132003784, "num_tokens": 29115360.0, "step": 766 }, { "epoch": 0.09757028367892126, "ewc_loss": 0.00897216796875, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.397014617919922, "learning_rate": 3.247138618058499e-07, "loss": 0.5498, "mean_token_accuracy": 0.8234409093856812, "num_tokens": 29160139.0, "step": 767 }, { "epoch": 0.09769749395751176, "ewc_loss": 0.00897216796875, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.38135814666748, "learning_rate": 3.251377702416278e-07, "loss": 0.5789, "mean_token_accuracy": 0.8203089237213135, "num_tokens": 29202872.0, "step": 768 }, { "epoch": 0.09782470423610228, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.8710575103759766e-06, "grad_norm": 11.386873245239258, "learning_rate": 3.255616786774057e-07, "loss": 0.5261, "mean_token_accuracy": 0.8345717191696167, "num_tokens": 29246965.0, "step": 769 }, { "epoch": 0.09795191451469279, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.8710575103759766e-06, "grad_norm": 11.330167770385742, "learning_rate": 3.259855871131835e-07, "loss": 0.4648, "mean_token_accuracy": 0.8518603444099426, "num_tokens": 29289531.0, "step": 770 }, { "epoch": 0.0980791247932833, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.841255187988281e-06, "grad_norm": 11.327693939208984, "learning_rate": 3.264094955489614e-07, "loss": 0.5218, "mean_token_accuracy": 0.8351813554763794, "num_tokens": 29330484.0, "step": 771 }, { "epoch": 0.0982063350718738, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 5.8710575103759766e-06, "grad_norm": 11.392191886901855, "learning_rate": 3.268334039847393e-07, "loss": 0.5504, "mean_token_accuracy": 0.8260810375213623, "num_tokens": 29369595.0, "step": 772 }, { "epoch": 0.09833354535046432, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 5.8710575103759766e-06, "grad_norm": 11.588767051696777, "learning_rate": 3.2725731242051715e-07, "loss": 0.5387, "mean_token_accuracy": 0.8336396217346191, "num_tokens": 29411856.0, "step": 773 }, { "epoch": 0.09846075562905483, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 5.900859832763672e-06, "grad_norm": 11.333877563476562, "learning_rate": 3.27681220856295e-07, "loss": 0.5206, "mean_token_accuracy": 0.8379117250442505, "num_tokens": 29454754.0, "step": 774 }, { "epoch": 0.09858796590764533, "ewc_loss": 0.00897216796875, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.811452865600586e-06, "grad_norm": 11.320517539978027, "learning_rate": 3.281051292920729e-07, "loss": 0.5156, "mean_token_accuracy": 0.8345797061920166, "num_tokens": 29489131.0, "step": 775 }, { "epoch": 0.09871517618623585, "ewc_loss": 0.00909423828125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.900859832763672e-06, "grad_norm": 11.622122764587402, "learning_rate": 3.285290377278508e-07, "loss": 0.5708, "mean_token_accuracy": 0.8190103769302368, "num_tokens": 29528292.0, "step": 776 }, { "epoch": 0.09884238646482636, "ewc_loss": 0.00909423828125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.900859832763672e-06, "grad_norm": 11.425840377807617, "learning_rate": 3.2895294616362864e-07, "loss": 0.5026, "mean_token_accuracy": 0.8419467806816101, "num_tokens": 29564878.0, "step": 777 }, { "epoch": 0.09896959674341686, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.8710575103759766e-06, "grad_norm": 11.43835163116455, "learning_rate": 3.293768545994065e-07, "loss": 0.5236, "mean_token_accuracy": 0.8314638137817383, "num_tokens": 29602826.0, "step": 778 }, { "epoch": 0.09909680702200738, "ewc_loss": 0.009033203125, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 5.8710575103759766e-06, "grad_norm": 11.561243057250977, "learning_rate": 3.298007630351844e-07, "loss": 0.5913, "mean_token_accuracy": 0.8176169395446777, "num_tokens": 29640323.0, "step": 779 }, { "epoch": 0.09922401730059789, "ewc_loss": 0.0091552734375, "ewc_loss_diag": 3.203749656677246e-06, "ewc_loss_parallel": 5.990266799926758e-06, "grad_norm": 11.519773483276367, "learning_rate": 3.302246714709623e-07, "loss": 0.5211, "mean_token_accuracy": 0.8346371650695801, "num_tokens": 29683302.0, "step": 780 }, { "epoch": 0.09935122757918839, "ewc_loss": 0.0091552734375, "ewc_loss_diag": 3.203749656677246e-06, "ewc_loss_parallel": 5.9604644775390625e-06, "grad_norm": 11.550599098205566, "learning_rate": 3.3064857990674013e-07, "loss": 0.5471, "mean_token_accuracy": 0.8306752443313599, "num_tokens": 29721085.0, "step": 781 }, { "epoch": 0.0994784378577789, "ewc_loss": 0.0091552734375, "ewc_loss_diag": 3.203749656677246e-06, "ewc_loss_parallel": 5.9604644775390625e-06, "grad_norm": 11.661918640136719, "learning_rate": 3.31072488342518e-07, "loss": 0.4979, "mean_token_accuracy": 0.8414232730865479, "num_tokens": 29753810.0, "step": 782 }, { "epoch": 0.09960564813636942, "ewc_loss": 0.00921630859375, "ewc_loss_diag": 3.203749656677246e-06, "ewc_loss_parallel": 6.020069122314453e-06, "grad_norm": 11.672320365905762, "learning_rate": 3.314963967782959e-07, "loss": 0.5264, "mean_token_accuracy": 0.8345987796783447, "num_tokens": 29796488.0, "step": 783 }, { "epoch": 0.09973285841495994, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.203749656677246e-06, "ewc_loss_parallel": 6.0498714447021484e-06, "grad_norm": 11.537985801696777, "learning_rate": 3.319203052140738e-07, "loss": 0.5361, "mean_token_accuracy": 0.8313886523246765, "num_tokens": 29838181.0, "step": 784 }, { "epoch": 0.09986006869355044, "ewc_loss": 0.0091552734375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 5.930662155151367e-06, "grad_norm": 11.942378997802734, "learning_rate": 3.323442136498516e-07, "loss": 0.5172, "mean_token_accuracy": 0.8369165658950806, "num_tokens": 29868893.0, "step": 785 }, { "epoch": 0.09998727897214095, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 6.0498714447021484e-06, "grad_norm": 11.68710994720459, "learning_rate": 3.3276812208562947e-07, "loss": 0.5342, "mean_token_accuracy": 0.8298812508583069, "num_tokens": 29903571.0, "step": 786 }, { "epoch": 0.10011448925073146, "ewc_loss": 0.0091552734375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 5.9604644775390625e-06, "grad_norm": 11.519942283630371, "learning_rate": 3.3319203052140737e-07, "loss": 0.5911, "mean_token_accuracy": 0.8215322494506836, "num_tokens": 29940438.0, "step": 787 }, { "epoch": 0.10024169952932196, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 6.0498714447021484e-06, "grad_norm": 11.915936470031738, "learning_rate": 3.336159389571852e-07, "loss": 0.5143, "mean_token_accuracy": 0.8427681922912598, "num_tokens": 29976538.0, "step": 788 }, { "epoch": 0.10036890980791248, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 6.079673767089844e-06, "grad_norm": 11.524605751037598, "learning_rate": 3.340398473929631e-07, "loss": 0.5841, "mean_token_accuracy": 0.8175774812698364, "num_tokens": 30019814.0, "step": 789 }, { "epoch": 0.100496120086503, "ewc_loss": 0.00921630859375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 5.990266799926758e-06, "grad_norm": 11.47454833984375, "learning_rate": 3.3446375582874096e-07, "loss": 0.5334, "mean_token_accuracy": 0.8332105875015259, "num_tokens": 30057578.0, "step": 790 }, { "epoch": 0.1006233303650935, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2335519790649414e-06, "ewc_loss_parallel": 6.079673767089844e-06, "grad_norm": 11.7509765625, "learning_rate": 3.3488766426451886e-07, "loss": 0.6065, "mean_token_accuracy": 0.8191620111465454, "num_tokens": 30094869.0, "step": 791 }, { "epoch": 0.10075054064368401, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 6.079673767089844e-06, "grad_norm": 11.590584754943848, "learning_rate": 3.353115727002967e-07, "loss": 0.5672, "mean_token_accuracy": 0.8238334655761719, "num_tokens": 30136446.0, "step": 792 }, { "epoch": 0.10087775092227452, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2335519790649414e-06, "ewc_loss_parallel": 6.020069122314453e-06, "grad_norm": 11.755356788635254, "learning_rate": 3.357354811360746e-07, "loss": 0.5607, "mean_token_accuracy": 0.8206948041915894, "num_tokens": 30179226.0, "step": 793 }, { "epoch": 0.10100496120086502, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2633543014526367e-06, "ewc_loss_parallel": 6.0498714447021484e-06, "grad_norm": 11.634055137634277, "learning_rate": 3.3615938957185245e-07, "loss": 0.5526, "mean_token_accuracy": 0.8261669874191284, "num_tokens": 30213890.0, "step": 794 }, { "epoch": 0.10113217147945554, "ewc_loss": 0.00927734375, "ewc_loss_diag": 3.2633543014526367e-06, "ewc_loss_parallel": 5.990266799926758e-06, "grad_norm": 11.506845474243164, "learning_rate": 3.3658329800763035e-07, "loss": 0.6021, "mean_token_accuracy": 0.8084368109703064, "num_tokens": 30245691.0, "step": 795 }, { "epoch": 0.10125938175804605, "ewc_loss": 0.0093994140625, "ewc_loss_diag": 3.2633543014526367e-06, "ewc_loss_parallel": 6.109476089477539e-06, "grad_norm": 11.715755462646484, "learning_rate": 3.370072064434082e-07, "loss": 0.5574, "mean_token_accuracy": 0.8182634711265564, "num_tokens": 30281738.0, "step": 796 }, { "epoch": 0.10138659203663657, "ewc_loss": 0.0093994140625, "ewc_loss_diag": 3.2633543014526367e-06, "ewc_loss_parallel": 6.16908073425293e-06, "grad_norm": 11.643179893493652, "learning_rate": 3.374311148791861e-07, "loss": 0.5209, "mean_token_accuracy": 0.8340887427330017, "num_tokens": 30320707.0, "step": 797 }, { "epoch": 0.10151380231522707, "ewc_loss": 0.0093994140625, "ewc_loss_diag": 3.2633543014526367e-06, "ewc_loss_parallel": 6.16908073425293e-06, "grad_norm": 11.833746910095215, "learning_rate": 3.3785502331496394e-07, "loss": 0.5321, "mean_token_accuracy": 0.8303635716438293, "num_tokens": 30351379.0, "step": 798 }, { "epoch": 0.10164101259381758, "ewc_loss": 0.009521484375, "ewc_loss_diag": 3.2782554626464844e-06, "ewc_loss_parallel": 6.22868537902832e-06, "grad_norm": 11.846419334411621, "learning_rate": 3.3827893175074184e-07, "loss": 0.5548, "mean_token_accuracy": 0.8241889476776123, "num_tokens": 30382395.0, "step": 799 }, { "epoch": 0.1017682228724081, "ewc_loss": 0.00946044921875, "ewc_loss_diag": 3.2782554626464844e-06, "ewc_loss_parallel": 6.16908073425293e-06, "grad_norm": 11.911877632141113, "learning_rate": 3.387028401865197e-07, "loss": 0.5605, "mean_token_accuracy": 0.8276673555374146, "num_tokens": 30419758.0, "step": 800 }, { "epoch": 0.1018954331509986, "ewc_loss": 0.009521484375, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 6.198883056640625e-06, "grad_norm": 11.749504089355469, "learning_rate": 3.391267486222976e-07, "loss": 0.5431, "mean_token_accuracy": 0.8265818357467651, "num_tokens": 30458115.0, "step": 801 }, { "epoch": 0.10202264342958911, "ewc_loss": 0.009521484375, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 6.198883056640625e-06, "grad_norm": 11.697490692138672, "learning_rate": 3.3955065705807543e-07, "loss": 0.5787, "mean_token_accuracy": 0.8164491653442383, "num_tokens": 30489679.0, "step": 802 }, { "epoch": 0.10214985370817962, "ewc_loss": 0.009521484375, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 6.22868537902832e-06, "grad_norm": 11.771626472473145, "learning_rate": 3.3997456549385333e-07, "loss": 0.5144, "mean_token_accuracy": 0.838054895401001, "num_tokens": 30524874.0, "step": 803 }, { "epoch": 0.10227706398677013, "ewc_loss": 0.0096435546875, "ewc_loss_diag": 3.337860107421875e-06, "ewc_loss_parallel": 6.288290023803711e-06, "grad_norm": 11.610769271850586, "learning_rate": 3.403984739296312e-07, "loss": 0.59, "mean_token_accuracy": 0.8164246678352356, "num_tokens": 30558354.0, "step": 804 }, { "epoch": 0.10240427426536064, "ewc_loss": 0.00958251953125, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 6.288290023803711e-06, "grad_norm": 11.618374824523926, "learning_rate": 3.408223823654091e-07, "loss": 0.5192, "mean_token_accuracy": 0.8358122706413269, "num_tokens": 30594827.0, "step": 805 }, { "epoch": 0.10253148454395115, "ewc_loss": 0.009521484375, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 6.22868537902832e-06, "grad_norm": 11.67365550994873, "learning_rate": 3.412462908011869e-07, "loss": 0.5588, "mean_token_accuracy": 0.8305865526199341, "num_tokens": 30635259.0, "step": 806 }, { "epoch": 0.10265869482254165, "ewc_loss": 0.00970458984375, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 6.318092346191406e-06, "grad_norm": 11.785157203674316, "learning_rate": 3.4167019923696477e-07, "loss": 0.5524, "mean_token_accuracy": 0.831717848777771, "num_tokens": 30681850.0, "step": 807 }, { "epoch": 0.10278590510113217, "ewc_loss": 0.0096435546875, "ewc_loss_diag": 3.3527612686157227e-06, "ewc_loss_parallel": 6.318092346191406e-06, "grad_norm": 11.737857818603516, "learning_rate": 3.4209410767274267e-07, "loss": 0.5202, "mean_token_accuracy": 0.8403143286705017, "num_tokens": 30721302.0, "step": 808 }, { "epoch": 0.10291311537972268, "ewc_loss": 0.00970458984375, "ewc_loss_diag": 3.337860107421875e-06, "ewc_loss_parallel": 6.3478946685791016e-06, "grad_norm": 11.602421760559082, "learning_rate": 3.4251801610852057e-07, "loss": 0.5953, "mean_token_accuracy": 0.8119080066680908, "num_tokens": 30761975.0, "step": 809 }, { "epoch": 0.1030403256583132, "ewc_loss": 0.0096435546875, "ewc_loss_diag": 3.337860107421875e-06, "ewc_loss_parallel": 6.288290023803711e-06, "grad_norm": 11.87789249420166, "learning_rate": 3.429419245442984e-07, "loss": 0.5486, "mean_token_accuracy": 0.826143741607666, "num_tokens": 30797754.0, "step": 810 }, { "epoch": 0.1031675359369037, "ewc_loss": 0.009765625, "ewc_loss_diag": 3.3527612686157227e-06, "ewc_loss_parallel": 6.4373016357421875e-06, "grad_norm": 11.77666187286377, "learning_rate": 3.4336583298007626e-07, "loss": 0.5224, "mean_token_accuracy": 0.8364748954772949, "num_tokens": 30834183.0, "step": 811 }, { "epoch": 0.10329474621549421, "ewc_loss": 0.009765625, "ewc_loss_diag": 3.337860107421875e-06, "ewc_loss_parallel": 6.407499313354492e-06, "grad_norm": 11.68973159790039, "learning_rate": 3.4378974141585416e-07, "loss": 0.5464, "mean_token_accuracy": 0.8299481868743896, "num_tokens": 30874160.0, "step": 812 }, { "epoch": 0.10342195649408473, "ewc_loss": 0.009765625, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 6.377696990966797e-06, "grad_norm": 11.700427055358887, "learning_rate": 3.4421364985163206e-07, "loss": 0.4848, "mean_token_accuracy": 0.8460057973861694, "num_tokens": 30916827.0, "step": 813 }, { "epoch": 0.10354916677267523, "ewc_loss": 0.00982666015625, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 6.467103958129883e-06, "grad_norm": 11.708596229553223, "learning_rate": 3.446375582874099e-07, "loss": 0.5579, "mean_token_accuracy": 0.8276214003562927, "num_tokens": 30960063.0, "step": 814 }, { "epoch": 0.10367637705126574, "ewc_loss": 0.009765625, "ewc_loss_diag": 3.382563591003418e-06, "ewc_loss_parallel": 6.3478946685791016e-06, "grad_norm": 11.778634071350098, "learning_rate": 3.4506146672318775e-07, "loss": 0.5034, "mean_token_accuracy": 0.8377659916877747, "num_tokens": 30995172.0, "step": 815 }, { "epoch": 0.10380358732985626, "ewc_loss": 0.009765625, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 6.377696990966797e-06, "grad_norm": 11.772711753845215, "learning_rate": 3.4548537515896565e-07, "loss": 0.5671, "mean_token_accuracy": 0.8228509426116943, "num_tokens": 31035089.0, "step": 816 }, { "epoch": 0.10393079760844676, "ewc_loss": 0.00982666015625, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 6.4373016357421875e-06, "grad_norm": 11.864370346069336, "learning_rate": 3.4590928359474355e-07, "loss": 0.5461, "mean_token_accuracy": 0.8271535634994507, "num_tokens": 31074836.0, "step": 817 }, { "epoch": 0.10405800788703727, "ewc_loss": 0.00982666015625, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 6.4373016357421875e-06, "grad_norm": 11.711185455322266, "learning_rate": 3.463331920305214e-07, "loss": 0.566, "mean_token_accuracy": 0.8218473196029663, "num_tokens": 31114700.0, "step": 818 }, { "epoch": 0.10418521816562779, "ewc_loss": 0.0098876953125, "ewc_loss_diag": 3.4123659133911133e-06, "ewc_loss_parallel": 6.4373016357421875e-06, "grad_norm": 11.958531379699707, "learning_rate": 3.4675710046629924e-07, "loss": 0.5146, "mean_token_accuracy": 0.8374906182289124, "num_tokens": 31148258.0, "step": 819 }, { "epoch": 0.1043124284442183, "ewc_loss": 0.00994873046875, "ewc_loss_diag": 3.4123659133911133e-06, "ewc_loss_parallel": 6.5267086029052734e-06, "grad_norm": 11.744562149047852, "learning_rate": 3.4718100890207714e-07, "loss": 0.5551, "mean_token_accuracy": 0.8267620801925659, "num_tokens": 31185720.0, "step": 820 }, { "epoch": 0.1044396387228088, "ewc_loss": 0.00982666015625, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 6.4373016357421875e-06, "grad_norm": 11.851200103759766, "learning_rate": 3.4760491733785504e-07, "loss": 0.5659, "mean_token_accuracy": 0.8194506764411926, "num_tokens": 31220353.0, "step": 821 }, { "epoch": 0.10456684900139931, "ewc_loss": 0.00994873046875, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 6.556510925292969e-06, "grad_norm": 11.930932998657227, "learning_rate": 3.480288257736329e-07, "loss": 0.543, "mean_token_accuracy": 0.8275299072265625, "num_tokens": 31256477.0, "step": 822 }, { "epoch": 0.10469405927998983, "ewc_loss": 0.00994873046875, "ewc_loss_diag": 3.4123659133911133e-06, "ewc_loss_parallel": 6.5267086029052734e-06, "grad_norm": 11.74165153503418, "learning_rate": 3.4845273420941073e-07, "loss": 0.5406, "mean_token_accuracy": 0.8313028812408447, "num_tokens": 31294365.0, "step": 823 }, { "epoch": 0.10482126955858033, "ewc_loss": 0.00994873046875, "ewc_loss_diag": 3.427267074584961e-06, "ewc_loss_parallel": 6.5267086029052734e-06, "grad_norm": 11.934861183166504, "learning_rate": 3.4887664264518863e-07, "loss": 0.5352, "mean_token_accuracy": 0.828034520149231, "num_tokens": 31329357.0, "step": 824 }, { "epoch": 0.10494847983717084, "ewc_loss": 0.01007080078125, "ewc_loss_diag": 3.427267074584961e-06, "ewc_loss_parallel": 6.645917892456055e-06, "grad_norm": 11.898003578186035, "learning_rate": 3.4930055108096653e-07, "loss": 0.5402, "mean_token_accuracy": 0.8303059935569763, "num_tokens": 31372180.0, "step": 825 }, { "epoch": 0.10507569011576136, "ewc_loss": 0.01007080078125, "ewc_loss_diag": 3.4570693969726562e-06, "ewc_loss_parallel": 6.586313247680664e-06, "grad_norm": 11.875290870666504, "learning_rate": 3.497244595167443e-07, "loss": 0.5252, "mean_token_accuracy": 0.8350329399108887, "num_tokens": 31414866.0, "step": 826 }, { "epoch": 0.10520290039435186, "ewc_loss": 0.010009765625, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 6.5267086029052734e-06, "grad_norm": 12.038166046142578, "learning_rate": 3.501483679525222e-07, "loss": 0.5945, "mean_token_accuracy": 0.8143750429153442, "num_tokens": 31450583.0, "step": 827 }, { "epoch": 0.10533011067294237, "ewc_loss": 0.0101318359375, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 6.616115570068359e-06, "grad_norm": 11.781634330749512, "learning_rate": 3.505722763883001e-07, "loss": 0.5245, "mean_token_accuracy": 0.8328443765640259, "num_tokens": 31490371.0, "step": 828 }, { "epoch": 0.10545732095153289, "ewc_loss": 0.010009765625, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 6.5267086029052734e-06, "grad_norm": 11.758671760559082, "learning_rate": 3.50996184824078e-07, "loss": 0.568, "mean_token_accuracy": 0.8213675022125244, "num_tokens": 31525951.0, "step": 829 }, { "epoch": 0.10558453123012339, "ewc_loss": 0.01007080078125, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 6.586313247680664e-06, "grad_norm": 11.86920166015625, "learning_rate": 3.514200932598558e-07, "loss": 0.6071, "mean_token_accuracy": 0.8120149374008179, "num_tokens": 31570109.0, "step": 830 }, { "epoch": 0.1057117415087139, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.67572021484375e-06, "grad_norm": 12.1392240524292, "learning_rate": 3.518440016956337e-07, "loss": 0.5482, "mean_token_accuracy": 0.828649640083313, "num_tokens": 31610569.0, "step": 831 }, { "epoch": 0.10583895178730442, "ewc_loss": 0.0101318359375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.645917892456055e-06, "grad_norm": 11.68933391571045, "learning_rate": 3.522679101314116e-07, "loss": 0.5596, "mean_token_accuracy": 0.8275334239006042, "num_tokens": 31653585.0, "step": 832 }, { "epoch": 0.10596616206589493, "ewc_loss": 0.01007080078125, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.556510925292969e-06, "grad_norm": 12.007539749145508, "learning_rate": 3.526918185671895e-07, "loss": 0.5695, "mean_token_accuracy": 0.8200967907905579, "num_tokens": 31696353.0, "step": 833 }, { "epoch": 0.10609337234448543, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.67572021484375e-06, "grad_norm": 12.102160453796387, "learning_rate": 3.531157270029673e-07, "loss": 0.5008, "mean_token_accuracy": 0.8419973254203796, "num_tokens": 31730620.0, "step": 834 }, { "epoch": 0.10622058262307595, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.705522537231445e-06, "grad_norm": 12.006048202514648, "learning_rate": 3.535396354387452e-07, "loss": 0.5159, "mean_token_accuracy": 0.8354555368423462, "num_tokens": 31770538.0, "step": 835 }, { "epoch": 0.10634779290166646, "ewc_loss": 0.0101318359375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.645917892456055e-06, "grad_norm": 12.180022239685059, "learning_rate": 3.539635438745231e-07, "loss": 0.5593, "mean_token_accuracy": 0.8225799798965454, "num_tokens": 31808687.0, "step": 836 }, { "epoch": 0.10647500318025696, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.67572021484375e-06, "grad_norm": 12.087685585021973, "learning_rate": 3.54387452310301e-07, "loss": 0.5373, "mean_token_accuracy": 0.8327096104621887, "num_tokens": 31852310.0, "step": 837 }, { "epoch": 0.10660221345884748, "ewc_loss": 0.0101318359375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.616115570068359e-06, "grad_norm": 11.883902549743652, "learning_rate": 3.548113607460788e-07, "loss": 0.5071, "mean_token_accuracy": 0.8424466848373413, "num_tokens": 31887897.0, "step": 838 }, { "epoch": 0.10672942373743799, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.705522537231445e-06, "grad_norm": 12.192357063293457, "learning_rate": 3.552352691818567e-07, "loss": 0.5326, "mean_token_accuracy": 0.8336808681488037, "num_tokens": 31926159.0, "step": 839 }, { "epoch": 0.10685663401602849, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.705522537231445e-06, "grad_norm": 12.050889015197754, "learning_rate": 3.556591776176346e-07, "loss": 0.5054, "mean_token_accuracy": 0.8411564826965332, "num_tokens": 31968432.0, "step": 840 }, { "epoch": 0.106983844294619, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.67572021484375e-06, "grad_norm": 12.10787582397461, "learning_rate": 3.560830860534125e-07, "loss": 0.5413, "mean_token_accuracy": 0.8319170475006104, "num_tokens": 32000949.0, "step": 841 }, { "epoch": 0.10711105457320952, "ewc_loss": 0.01025390625, "ewc_loss_diag": 3.516674041748047e-06, "ewc_loss_parallel": 6.705522537231445e-06, "grad_norm": 12.455687522888184, "learning_rate": 3.565069944891903e-07, "loss": 0.5848, "mean_token_accuracy": 0.8167003989219666, "num_tokens": 32033862.0, "step": 842 }, { "epoch": 0.10723826485180002, "ewc_loss": 0.01025390625, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.765127182006836e-06, "grad_norm": 11.9087553024292, "learning_rate": 3.569309029249682e-07, "loss": 0.5455, "mean_token_accuracy": 0.8299582004547119, "num_tokens": 32068623.0, "step": 843 }, { "epoch": 0.10736547513039053, "ewc_loss": 0.01019287109375, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.705522537231445e-06, "grad_norm": 12.255345344543457, "learning_rate": 3.573548113607461e-07, "loss": 0.5313, "mean_token_accuracy": 0.8337640166282654, "num_tokens": 32106658.0, "step": 844 }, { "epoch": 0.10749268540898105, "ewc_loss": 0.01031494140625, "ewc_loss_diag": 3.516674041748047e-06, "ewc_loss_parallel": 6.794929504394531e-06, "grad_norm": 12.22609806060791, "learning_rate": 3.577787197965239e-07, "loss": 0.5756, "mean_token_accuracy": 0.8180879950523376, "num_tokens": 32142031.0, "step": 845 }, { "epoch": 0.10761989568757156, "ewc_loss": 0.01031494140625, "ewc_loss_diag": 3.516674041748047e-06, "ewc_loss_parallel": 6.794929504394531e-06, "grad_norm": 11.927810668945312, "learning_rate": 3.5820262823230177e-07, "loss": 0.5042, "mean_token_accuracy": 0.8449567556381226, "num_tokens": 32182686.0, "step": 846 }, { "epoch": 0.10774710596616206, "ewc_loss": 0.01031494140625, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.794929504394531e-06, "grad_norm": 12.22203254699707, "learning_rate": 3.5862653666807967e-07, "loss": 0.6025, "mean_token_accuracy": 0.8126428723335266, "num_tokens": 32218027.0, "step": 847 }, { "epoch": 0.10787431624475258, "ewc_loss": 0.01031494140625, "ewc_loss_diag": 3.5017728805541992e-06, "ewc_loss_parallel": 6.8247318267822266e-06, "grad_norm": 11.898058891296387, "learning_rate": 3.5905044510385757e-07, "loss": 0.5555, "mean_token_accuracy": 0.8288483619689941, "num_tokens": 32257854.0, "step": 848 }, { "epoch": 0.10800152652334309, "ewc_loss": 0.01025390625, "ewc_loss_diag": 3.516674041748047e-06, "ewc_loss_parallel": 6.765127182006836e-06, "grad_norm": 12.299856185913086, "learning_rate": 3.594743535396354e-07, "loss": 0.5382, "mean_token_accuracy": 0.8313270807266235, "num_tokens": 32295019.0, "step": 849 }, { "epoch": 0.10812873680193359, "ewc_loss": 0.0103759765625, "ewc_loss_diag": 3.516674041748047e-06, "ewc_loss_parallel": 6.854534149169922e-06, "grad_norm": 12.132373809814453, "learning_rate": 3.5989826197541326e-07, "loss": 0.5889, "mean_token_accuracy": 0.8165103197097778, "num_tokens": 32335652.0, "step": 850 }, { "epoch": 0.1082559470805241, "ewc_loss": 0.0103759765625, "ewc_loss_diag": 3.5315752029418945e-06, "ewc_loss_parallel": 6.8247318267822266e-06, "grad_norm": 12.150800704956055, "learning_rate": 3.6032217041119116e-07, "loss": 0.536, "mean_token_accuracy": 0.8272740840911865, "num_tokens": 32377249.0, "step": 851 }, { "epoch": 0.10838315735911462, "ewc_loss": 0.0103759765625, "ewc_loss_diag": 3.5315752029418945e-06, "ewc_loss_parallel": 6.854534149169922e-06, "grad_norm": 12.094446182250977, "learning_rate": 3.6074607884696906e-07, "loss": 0.5072, "mean_token_accuracy": 0.8430112600326538, "num_tokens": 32422467.0, "step": 852 }, { "epoch": 0.10851036763770512, "ewc_loss": 0.0103759765625, "ewc_loss_diag": 3.5315752029418945e-06, "ewc_loss_parallel": 6.854534149169922e-06, "grad_norm": 12.048348426818848, "learning_rate": 3.611699872827469e-07, "loss": 0.4997, "mean_token_accuracy": 0.8419670462608337, "num_tokens": 32462120.0, "step": 853 }, { "epoch": 0.10863757791629564, "ewc_loss": 0.01043701171875, "ewc_loss_diag": 3.5315752029418945e-06, "ewc_loss_parallel": 6.884336471557617e-06, "grad_norm": 12.049881935119629, "learning_rate": 3.6159389571852475e-07, "loss": 0.6199, "mean_token_accuracy": 0.8091769814491272, "num_tokens": 32498484.0, "step": 854 }, { "epoch": 0.10876478819488615, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.546476364135742e-06, "ewc_loss_parallel": 6.943941116333008e-06, "grad_norm": 12.547281265258789, "learning_rate": 3.6201780415430265e-07, "loss": 0.5528, "mean_token_accuracy": 0.8277314901351929, "num_tokens": 32541536.0, "step": 855 }, { "epoch": 0.10889199847347665, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.546476364135742e-06, "ewc_loss_parallel": 6.973743438720703e-06, "grad_norm": 12.065690040588379, "learning_rate": 3.6244171259008055e-07, "loss": 0.5784, "mean_token_accuracy": 0.8229324817657471, "num_tokens": 32579750.0, "step": 856 }, { "epoch": 0.10901920875206716, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.546476364135742e-06, "ewc_loss_parallel": 6.9141387939453125e-06, "grad_norm": 12.058600425720215, "learning_rate": 3.628656210258584e-07, "loss": 0.5847, "mean_token_accuracy": 0.817034125328064, "num_tokens": 32617518.0, "step": 857 }, { "epoch": 0.10914641903065768, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.546476364135742e-06, "ewc_loss_parallel": 6.9141387939453125e-06, "grad_norm": 12.329895973205566, "learning_rate": 3.6328952946163624e-07, "loss": 0.526, "mean_token_accuracy": 0.8338647484779358, "num_tokens": 32653931.0, "step": 858 }, { "epoch": 0.1092736293092482, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 6.943941116333008e-06, "grad_norm": 12.238507270812988, "learning_rate": 3.6371343789741414e-07, "loss": 0.5144, "mean_token_accuracy": 0.837037205696106, "num_tokens": 32693797.0, "step": 859 }, { "epoch": 0.1094008395878387, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.56137752532959e-06, "ewc_loss_parallel": 6.9141387939453125e-06, "grad_norm": 12.105161666870117, "learning_rate": 3.6413734633319204e-07, "loss": 0.5633, "mean_token_accuracy": 0.8244925737380981, "num_tokens": 32732432.0, "step": 860 }, { "epoch": 0.10952804986642921, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 6.973743438720703e-06, "grad_norm": 12.49409008026123, "learning_rate": 3.645612547689699e-07, "loss": 0.5149, "mean_token_accuracy": 0.8337586522102356, "num_tokens": 32769225.0, "step": 861 }, { "epoch": 0.10965526014501972, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 6.973743438720703e-06, "grad_norm": 12.209551811218262, "learning_rate": 3.6498516320474773e-07, "loss": 0.5093, "mean_token_accuracy": 0.841839611530304, "num_tokens": 32812027.0, "step": 862 }, { "epoch": 0.10978247042361022, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.56137752532959e-06, "ewc_loss_parallel": 6.9141387939453125e-06, "grad_norm": 12.175117492675781, "learning_rate": 3.6540907164052563e-07, "loss": 0.4891, "mean_token_accuracy": 0.8468902111053467, "num_tokens": 32849334.0, "step": 863 }, { "epoch": 0.10990968070220074, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 6.973743438720703e-06, "grad_norm": 12.32972240447998, "learning_rate": 3.658329800763035e-07, "loss": 0.5825, "mean_token_accuracy": 0.8166821002960205, "num_tokens": 32888422.0, "step": 864 }, { "epoch": 0.11003689098079125, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.0035457611083984e-06, "grad_norm": 12.1546630859375, "learning_rate": 3.662568885120814e-07, "loss": 0.5419, "mean_token_accuracy": 0.8283295631408691, "num_tokens": 32922672.0, "step": 865 }, { "epoch": 0.11016410125938175, "ewc_loss": 0.010498046875, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 6.9141387939453125e-06, "grad_norm": 12.217672348022461, "learning_rate": 3.666807969478592e-07, "loss": 0.5708, "mean_token_accuracy": 0.8179912567138672, "num_tokens": 32961694.0, "step": 866 }, { "epoch": 0.11029131153797227, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 7.033348083496094e-06, "grad_norm": 12.426877975463867, "learning_rate": 3.671047053836371e-07, "loss": 0.5608, "mean_token_accuracy": 0.822611927986145, "num_tokens": 32998595.0, "step": 867 }, { "epoch": 0.11041852181656278, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.606081008911133e-06, "ewc_loss_parallel": 6.943941116333008e-06, "grad_norm": 12.338521957397461, "learning_rate": 3.6752861381941497e-07, "loss": 0.5186, "mean_token_accuracy": 0.8346112966537476, "num_tokens": 33035884.0, "step": 868 }, { "epoch": 0.11054573209515328, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 6.973743438720703e-06, "grad_norm": 12.170419692993164, "learning_rate": 3.6795252225519287e-07, "loss": 0.5823, "mean_token_accuracy": 0.8210468888282776, "num_tokens": 33073335.0, "step": 869 }, { "epoch": 0.1106729423737438, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.0035457611083984e-06, "grad_norm": 12.1697359085083, "learning_rate": 3.6837643069097077e-07, "loss": 0.4805, "mean_token_accuracy": 0.847795844078064, "num_tokens": 33111443.0, "step": 870 }, { "epoch": 0.11080015265233431, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.0035457611083984e-06, "grad_norm": 12.135433197021484, "learning_rate": 3.688003391267486e-07, "loss": 0.5403, "mean_token_accuracy": 0.8334741592407227, "num_tokens": 33156362.0, "step": 871 }, { "epoch": 0.11092736293092482, "ewc_loss": 0.01055908203125, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.0035457611083984e-06, "grad_norm": 12.053173065185547, "learning_rate": 3.6922424756252646e-07, "loss": 0.5092, "mean_token_accuracy": 0.8395032286643982, "num_tokens": 33204968.0, "step": 872 }, { "epoch": 0.11105457320951533, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.063150405883789e-06, "grad_norm": 12.433691024780273, "learning_rate": 3.6964815599830436e-07, "loss": 0.5584, "mean_token_accuracy": 0.822493851184845, "num_tokens": 33239265.0, "step": 873 }, { "epoch": 0.11118178348810584, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.6209821701049805e-06, "ewc_loss_parallel": 7.033348083496094e-06, "grad_norm": 12.212389945983887, "learning_rate": 3.7007206443408226e-07, "loss": 0.489, "mean_token_accuracy": 0.8470444083213806, "num_tokens": 33275897.0, "step": 874 }, { "epoch": 0.11130899376669635, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.033348083496094e-06, "grad_norm": 12.061684608459473, "learning_rate": 3.704959728698601e-07, "loss": 0.6155, "mean_token_accuracy": 0.8090415596961975, "num_tokens": 33313966.0, "step": 875 }, { "epoch": 0.11143620404528685, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.033348083496094e-06, "grad_norm": 12.204076766967773, "learning_rate": 3.7091988130563795e-07, "loss": 0.5582, "mean_token_accuracy": 0.8255020380020142, "num_tokens": 33352909.0, "step": 876 }, { "epoch": 0.11156341432387737, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.063150405883789e-06, "grad_norm": 12.17343807220459, "learning_rate": 3.7134378974141585e-07, "loss": 0.54, "mean_token_accuracy": 0.8318983912467957, "num_tokens": 33400120.0, "step": 877 }, { "epoch": 0.11169062460246788, "ewc_loss": 0.0106201171875, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 7.063150405883789e-06, "grad_norm": 12.15155029296875, "learning_rate": 3.7176769817719375e-07, "loss": 0.5705, "mean_token_accuracy": 0.8217639923095703, "num_tokens": 33436703.0, "step": 878 }, { "epoch": 0.11181783488105838, "ewc_loss": 0.0107421875, "ewc_loss_diag": 3.606081008911133e-06, "ewc_loss_parallel": 7.12275505065918e-06, "grad_norm": 12.266477584838867, "learning_rate": 3.7219160661297154e-07, "loss": 0.478, "mean_token_accuracy": 0.8510541915893555, "num_tokens": 33473515.0, "step": 879 }, { "epoch": 0.1119450451596489, "ewc_loss": 0.01068115234375, "ewc_loss_diag": 3.6209821701049805e-06, "ewc_loss_parallel": 7.092952728271484e-06, "grad_norm": 12.18114185333252, "learning_rate": 3.7261551504874944e-07, "loss": 0.5405, "mean_token_accuracy": 0.8325671553611755, "num_tokens": 33507659.0, "step": 880 }, { "epoch": 0.11207225543823941, "ewc_loss": 0.01068115234375, "ewc_loss_diag": 3.6209821701049805e-06, "ewc_loss_parallel": 7.092952728271484e-06, "grad_norm": 12.043556213378906, "learning_rate": 3.7303942348452734e-07, "loss": 0.486, "mean_token_accuracy": 0.8449992537498474, "num_tokens": 33542430.0, "step": 881 }, { "epoch": 0.11219946571682991, "ewc_loss": 0.01068115234375, "ewc_loss_diag": 3.6209821701049805e-06, "ewc_loss_parallel": 7.092952728271484e-06, "grad_norm": 12.254361152648926, "learning_rate": 3.7346333192030524e-07, "loss": 0.4913, "mean_token_accuracy": 0.8482134342193604, "num_tokens": 33582579.0, "step": 882 }, { "epoch": 0.11232667599542043, "ewc_loss": 0.0107421875, "ewc_loss_diag": 3.6209821701049805e-06, "ewc_loss_parallel": 7.12275505065918e-06, "grad_norm": 12.170867919921875, "learning_rate": 3.7388724035608303e-07, "loss": 0.5162, "mean_token_accuracy": 0.8409058451652527, "num_tokens": 33625928.0, "step": 883 }, { "epoch": 0.11245388627401094, "ewc_loss": 0.01068115234375, "ewc_loss_diag": 3.6209821701049805e-06, "ewc_loss_parallel": 7.092952728271484e-06, "grad_norm": 12.200928688049316, "learning_rate": 3.7431114879186093e-07, "loss": 0.5589, "mean_token_accuracy": 0.8258332014083862, "num_tokens": 33661325.0, "step": 884 }, { "epoch": 0.11258109655260146, "ewc_loss": 0.0108642578125, "ewc_loss_diag": 3.6507844924926758e-06, "ewc_loss_parallel": 7.212162017822266e-06, "grad_norm": 12.344202041625977, "learning_rate": 3.7473505722763883e-07, "loss": 0.5812, "mean_token_accuracy": 0.8176634907722473, "num_tokens": 33696472.0, "step": 885 }, { "epoch": 0.11270830683119196, "ewc_loss": 0.0107421875, "ewc_loss_diag": 3.6507844924926758e-06, "ewc_loss_parallel": 7.12275505065918e-06, "grad_norm": 12.289673805236816, "learning_rate": 3.7515896566341673e-07, "loss": 0.5365, "mean_token_accuracy": 0.8294664621353149, "num_tokens": 33732637.0, "step": 886 }, { "epoch": 0.11283551710978247, "ewc_loss": 0.0108642578125, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 7.18235969543457e-06, "grad_norm": 12.334026336669922, "learning_rate": 3.755828740991945e-07, "loss": 0.5593, "mean_token_accuracy": 0.8249510526657104, "num_tokens": 33775106.0, "step": 887 }, { "epoch": 0.11296272738837299, "ewc_loss": 0.0108642578125, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 7.18235969543457e-06, "grad_norm": 12.39757251739502, "learning_rate": 3.760067825349724e-07, "loss": 0.5214, "mean_token_accuracy": 0.8389327526092529, "num_tokens": 33811297.0, "step": 888 }, { "epoch": 0.11308993766696349, "ewc_loss": 0.0108642578125, "ewc_loss_diag": 3.680586814880371e-06, "ewc_loss_parallel": 7.212162017822266e-06, "grad_norm": 12.309104919433594, "learning_rate": 3.764306909707503e-07, "loss": 0.4825, "mean_token_accuracy": 0.8464196920394897, "num_tokens": 33854038.0, "step": 889 }, { "epoch": 0.113217147945554, "ewc_loss": 0.0108642578125, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 7.212162017822266e-06, "grad_norm": 12.317877769470215, "learning_rate": 3.768545994065282e-07, "loss": 0.5399, "mean_token_accuracy": 0.8301399350166321, "num_tokens": 33884929.0, "step": 890 }, { "epoch": 0.11334435822414451, "ewc_loss": 0.0108642578125, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 7.18235969543457e-06, "grad_norm": 12.205971717834473, "learning_rate": 3.77278507842306e-07, "loss": 0.5476, "mean_token_accuracy": 0.8311865925788879, "num_tokens": 33926065.0, "step": 891 }, { "epoch": 0.11347156850273502, "ewc_loss": 0.010986328125, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 7.271766662597656e-06, "grad_norm": 12.271719932556152, "learning_rate": 3.777024162780839e-07, "loss": 0.4968, "mean_token_accuracy": 0.840999960899353, "num_tokens": 33965557.0, "step": 892 }, { "epoch": 0.11359877878132553, "ewc_loss": 0.010986328125, "ewc_loss_diag": 3.680586814880371e-06, "ewc_loss_parallel": 7.3015689849853516e-06, "grad_norm": 12.377504348754883, "learning_rate": 3.781263247138618e-07, "loss": 0.5077, "mean_token_accuracy": 0.8399567604064941, "num_tokens": 34004132.0, "step": 893 }, { "epoch": 0.11372598905991604, "ewc_loss": 0.010986328125, "ewc_loss_diag": 3.7103891372680664e-06, "ewc_loss_parallel": 7.3015689849853516e-06, "grad_norm": 12.30410385131836, "learning_rate": 3.785502331496397e-07, "loss": 0.5358, "mean_token_accuracy": 0.823409378528595, "num_tokens": 34039781.0, "step": 894 }, { "epoch": 0.11385319933850654, "ewc_loss": 0.01104736328125, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 7.361173629760742e-06, "grad_norm": 12.437586784362793, "learning_rate": 3.789741415854175e-07, "loss": 0.5639, "mean_token_accuracy": 0.8221169114112854, "num_tokens": 34080487.0, "step": 895 }, { "epoch": 0.11398040961709706, "ewc_loss": 0.010986328125, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 7.331371307373047e-06, "grad_norm": 12.322785377502441, "learning_rate": 3.793980500211954e-07, "loss": 0.5379, "mean_token_accuracy": 0.8309744000434875, "num_tokens": 34119913.0, "step": 896 }, { "epoch": 0.11410761989568757, "ewc_loss": 0.0111083984375, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.361173629760742e-06, "grad_norm": 12.438362121582031, "learning_rate": 3.798219584569733e-07, "loss": 0.4731, "mean_token_accuracy": 0.8497599363327026, "num_tokens": 34155811.0, "step": 897 }, { "epoch": 0.11423483017427809, "ewc_loss": 0.0111083984375, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.3909759521484375e-06, "grad_norm": 12.472588539123535, "learning_rate": 3.8024586689275115e-07, "loss": 0.5952, "mean_token_accuracy": 0.8181651830673218, "num_tokens": 34195022.0, "step": 898 }, { "epoch": 0.11436204045286859, "ewc_loss": 0.01116943359375, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.450580596923828e-06, "grad_norm": 12.382680892944336, "learning_rate": 3.80669775328529e-07, "loss": 0.528, "mean_token_accuracy": 0.8305068612098694, "num_tokens": 34229182.0, "step": 899 }, { "epoch": 0.1144892507314591, "ewc_loss": 0.01116943359375, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.450580596923828e-06, "grad_norm": 12.499970436096191, "learning_rate": 3.810936837643069e-07, "loss": 0.4859, "mean_token_accuracy": 0.8448278903961182, "num_tokens": 34266931.0, "step": 900 }, { "epoch": 0.11461646101004962, "ewc_loss": 0.0111083984375, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.420778274536133e-06, "grad_norm": 12.347949028015137, "learning_rate": 3.815175922000848e-07, "loss": 0.4801, "mean_token_accuracy": 0.844977855682373, "num_tokens": 34301705.0, "step": 901 }, { "epoch": 0.11474367128864012, "ewc_loss": 0.01116943359375, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.450580596923828e-06, "grad_norm": 12.694981575012207, "learning_rate": 3.8194150063586264e-07, "loss": 0.5642, "mean_token_accuracy": 0.8281115889549255, "num_tokens": 34340975.0, "step": 902 }, { "epoch": 0.11487088156723063, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.7550926208496094e-06, "ewc_loss_parallel": 7.4803829193115234e-06, "grad_norm": 12.732379913330078, "learning_rate": 3.823654090716405e-07, "loss": 0.53, "mean_token_accuracy": 0.8317594528198242, "num_tokens": 34380024.0, "step": 903 }, { "epoch": 0.11499809184582115, "ewc_loss": 0.01116943359375, "ewc_loss_diag": 3.7550926208496094e-06, "ewc_loss_parallel": 7.420778274536133e-06, "grad_norm": 12.537456512451172, "learning_rate": 3.827893175074184e-07, "loss": 0.5515, "mean_token_accuracy": 0.8339769840240479, "num_tokens": 34419654.0, "step": 904 }, { "epoch": 0.11512530212441165, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.7401914596557617e-06, "ewc_loss_parallel": 7.4803829193115234e-06, "grad_norm": 12.578522682189941, "learning_rate": 3.832132259431963e-07, "loss": 0.4825, "mean_token_accuracy": 0.8489968776702881, "num_tokens": 34453034.0, "step": 905 }, { "epoch": 0.11525251240300216, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.510185241699219e-06, "grad_norm": 12.611087799072266, "learning_rate": 3.8363713437897413e-07, "loss": 0.5898, "mean_token_accuracy": 0.8155866861343384, "num_tokens": 34493694.0, "step": 906 }, { "epoch": 0.11537972268159268, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.725290298461914e-06, "ewc_loss_parallel": 7.539987564086914e-06, "grad_norm": 12.630712509155273, "learning_rate": 3.8406104281475197e-07, "loss": 0.5819, "mean_token_accuracy": 0.8207903504371643, "num_tokens": 34531636.0, "step": 907 }, { "epoch": 0.11550693296018319, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.4803829193115234e-06, "grad_norm": 12.779115676879883, "learning_rate": 3.8448495125052987e-07, "loss": 0.5657, "mean_token_accuracy": 0.8201574087142944, "num_tokens": 34570851.0, "step": 908 }, { "epoch": 0.11563414323877369, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.450580596923828e-06, "grad_norm": 12.378973007202148, "learning_rate": 3.8490885968630777e-07, "loss": 0.517, "mean_token_accuracy": 0.8389737010002136, "num_tokens": 34613314.0, "step": 909 }, { "epoch": 0.1157613535173642, "ewc_loss": 0.01116943359375, "ewc_loss_diag": 3.7550926208496094e-06, "ewc_loss_parallel": 7.420778274536133e-06, "grad_norm": 12.548285484313965, "learning_rate": 3.853327681220856e-07, "loss": 0.5322, "mean_token_accuracy": 0.8313642740249634, "num_tokens": 34656701.0, "step": 910 }, { "epoch": 0.11588856379595472, "ewc_loss": 0.0113525390625, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.569789886474609e-06, "grad_norm": 12.550936698913574, "learning_rate": 3.8575667655786346e-07, "loss": 0.5078, "mean_token_accuracy": 0.8403441905975342, "num_tokens": 34688595.0, "step": 911 }, { "epoch": 0.11601577407454522, "ewc_loss": 0.01123046875, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.4803829193115234e-06, "grad_norm": 12.54187297821045, "learning_rate": 3.8618058499364136e-07, "loss": 0.5261, "mean_token_accuracy": 0.8353939652442932, "num_tokens": 34726609.0, "step": 912 }, { "epoch": 0.11614298435313573, "ewc_loss": 0.01141357421875, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.62939453125e-06, "grad_norm": 12.516767501831055, "learning_rate": 3.8660449342941926e-07, "loss": 0.5845, "mean_token_accuracy": 0.8210448622703552, "num_tokens": 34769213.0, "step": 913 }, { "epoch": 0.11627019463172625, "ewc_loss": 0.0113525390625, "ewc_loss_diag": 3.7848949432373047e-06, "ewc_loss_parallel": 7.539987564086914e-06, "grad_norm": 12.674925804138184, "learning_rate": 3.870284018651971e-07, "loss": 0.5551, "mean_token_accuracy": 0.8269134759902954, "num_tokens": 34806722.0, "step": 914 }, { "epoch": 0.11639740491031675, "ewc_loss": 0.0113525390625, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.569789886474609e-06, "grad_norm": 12.469508171081543, "learning_rate": 3.8745231030097495e-07, "loss": 0.5112, "mean_token_accuracy": 0.8414797782897949, "num_tokens": 34846408.0, "step": 915 }, { "epoch": 0.11652461518890726, "ewc_loss": 0.0113525390625, "ewc_loss_diag": 3.7848949432373047e-06, "ewc_loss_parallel": 7.569789886474609e-06, "grad_norm": 12.499421119689941, "learning_rate": 3.8787621873675285e-07, "loss": 0.4974, "mean_token_accuracy": 0.8420229554176331, "num_tokens": 34883791.0, "step": 916 }, { "epoch": 0.11665182546749778, "ewc_loss": 0.011474609375, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.68899917602539e-06, "grad_norm": 12.656282424926758, "learning_rate": 3.883001271725307e-07, "loss": 0.4935, "mean_token_accuracy": 0.8447864651679993, "num_tokens": 34922768.0, "step": 917 }, { "epoch": 0.11677903574608828, "ewc_loss": 0.01129150390625, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 7.510185241699219e-06, "grad_norm": 12.586715698242188, "learning_rate": 3.887240356083086e-07, "loss": 0.4742, "mean_token_accuracy": 0.8490187525749207, "num_tokens": 34962184.0, "step": 918 }, { "epoch": 0.11690624602467879, "ewc_loss": 0.01141357421875, "ewc_loss_diag": 3.7848949432373047e-06, "ewc_loss_parallel": 7.62939453125e-06, "grad_norm": 12.644990921020508, "learning_rate": 3.8914794404408644e-07, "loss": 0.5187, "mean_token_accuracy": 0.8356360197067261, "num_tokens": 34991455.0, "step": 919 }, { "epoch": 0.1170334563032693, "ewc_loss": 0.01141357421875, "ewc_loss_diag": 3.7848949432373047e-06, "ewc_loss_parallel": 7.62939453125e-06, "grad_norm": 12.67175006866455, "learning_rate": 3.8957185247986434e-07, "loss": 0.5476, "mean_token_accuracy": 0.8267253637313843, "num_tokens": 35025044.0, "step": 920 }, { "epoch": 0.11716066658185982, "ewc_loss": 0.01141357421875, "ewc_loss_diag": 3.7848949432373047e-06, "ewc_loss_parallel": 7.62939453125e-06, "grad_norm": 12.50790023803711, "learning_rate": 3.899957609156422e-07, "loss": 0.5212, "mean_token_accuracy": 0.8354080319404602, "num_tokens": 35064564.0, "step": 921 }, { "epoch": 0.11728787686045032, "ewc_loss": 0.011474609375, "ewc_loss_diag": 3.7848949432373047e-06, "ewc_loss_parallel": 7.68899917602539e-06, "grad_norm": 12.658821105957031, "learning_rate": 3.904196693514201e-07, "loss": 0.5388, "mean_token_accuracy": 0.8321715593338013, "num_tokens": 35109786.0, "step": 922 }, { "epoch": 0.11741508713904084, "ewc_loss": 0.01153564453125, "ewc_loss_diag": 3.7997961044311523e-06, "ewc_loss_parallel": 7.748603820800781e-06, "grad_norm": 12.784496307373047, "learning_rate": 3.9084357778719793e-07, "loss": 0.6027, "mean_token_accuracy": 0.8112160563468933, "num_tokens": 35148351.0, "step": 923 }, { "epoch": 0.11754229741763135, "ewc_loss": 0.01153564453125, "ewc_loss_diag": 3.7997961044311523e-06, "ewc_loss_parallel": 7.748603820800781e-06, "grad_norm": 12.596053123474121, "learning_rate": 3.9126748622297583e-07, "loss": 0.5667, "mean_token_accuracy": 0.8249818682670593, "num_tokens": 35183464.0, "step": 924 }, { "epoch": 0.11766950769622185, "ewc_loss": 0.0115966796875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.748603820800781e-06, "grad_norm": 12.541327476501465, "learning_rate": 3.916913946587537e-07, "loss": 0.5485, "mean_token_accuracy": 0.826337456703186, "num_tokens": 35218561.0, "step": 925 }, { "epoch": 0.11779671797481236, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.689270973205566, "learning_rate": 3.921153030945316e-07, "loss": 0.563, "mean_token_accuracy": 0.8226229548454285, "num_tokens": 35257740.0, "step": 926 }, { "epoch": 0.11792392825340288, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.550131797790527, "learning_rate": 3.925392115303094e-07, "loss": 0.5185, "mean_token_accuracy": 0.8369450569152832, "num_tokens": 35291632.0, "step": 927 }, { "epoch": 0.11805113853199338, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.618796348571777, "learning_rate": 3.929631199660873e-07, "loss": 0.5237, "mean_token_accuracy": 0.8328791856765747, "num_tokens": 35327686.0, "step": 928 }, { "epoch": 0.1181783488105839, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.8590087890625, "learning_rate": 3.9338702840186517e-07, "loss": 0.5255, "mean_token_accuracy": 0.8325920104980469, "num_tokens": 35364480.0, "step": 929 }, { "epoch": 0.11830555908917441, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.965036392211914, "learning_rate": 3.9381093683764307e-07, "loss": 0.5308, "mean_token_accuracy": 0.8340811729431152, "num_tokens": 35406017.0, "step": 930 }, { "epoch": 0.11843276936776491, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.681647300720215, "learning_rate": 3.942348452734209e-07, "loss": 0.5901, "mean_token_accuracy": 0.8157840967178345, "num_tokens": 35448447.0, "step": 931 }, { "epoch": 0.11855997964635542, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 13.579245567321777, "learning_rate": 3.946587537091988e-07, "loss": 0.5385, "mean_token_accuracy": 0.8289787769317627, "num_tokens": 35488120.0, "step": 932 }, { "epoch": 0.11868718992494594, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.958632469177246, "learning_rate": 3.9508266214497666e-07, "loss": 0.5111, "mean_token_accuracy": 0.8379837274551392, "num_tokens": 35531759.0, "step": 933 }, { "epoch": 0.11881440020353645, "ewc_loss": 0.011474609375, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.62939453125e-06, "grad_norm": 12.572432518005371, "learning_rate": 3.9550657058075456e-07, "loss": 0.496, "mean_token_accuracy": 0.8448067307472229, "num_tokens": 35571476.0, "step": 934 }, { "epoch": 0.11894161048212695, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 13.157451629638672, "learning_rate": 3.959304790165324e-07, "loss": 0.5366, "mean_token_accuracy": 0.8330051898956299, "num_tokens": 35608440.0, "step": 935 }, { "epoch": 0.11906882076071747, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.773957252502441, "learning_rate": 3.9635438745231025e-07, "loss": 0.5325, "mean_token_accuracy": 0.8336445093154907, "num_tokens": 35641861.0, "step": 936 }, { "epoch": 0.11919603103930798, "ewc_loss": 0.0115966796875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.748603820800781e-06, "grad_norm": 12.881539344787598, "learning_rate": 3.9677829588808815e-07, "loss": 0.555, "mean_token_accuracy": 0.825073778629303, "num_tokens": 35684626.0, "step": 937 }, { "epoch": 0.11932324131789848, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.873146057128906, "learning_rate": 3.9720220432386605e-07, "loss": 0.5598, "mean_token_accuracy": 0.8222572207450867, "num_tokens": 35718125.0, "step": 938 }, { "epoch": 0.119450451596489, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 13.208464622497559, "learning_rate": 3.976261127596439e-07, "loss": 0.5051, "mean_token_accuracy": 0.8428986072540283, "num_tokens": 35752818.0, "step": 939 }, { "epoch": 0.11957766187507951, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 13.051243782043457, "learning_rate": 3.9805002119542174e-07, "loss": 0.484, "mean_token_accuracy": 0.8465799689292908, "num_tokens": 35787097.0, "step": 940 }, { "epoch": 0.11970487215367001, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.851151466369629, "learning_rate": 3.9847392963119964e-07, "loss": 0.5836, "mean_token_accuracy": 0.8205495476722717, "num_tokens": 35825887.0, "step": 941 }, { "epoch": 0.11983208243226053, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 13.156721115112305, "learning_rate": 3.9889783806697754e-07, "loss": 0.5084, "mean_token_accuracy": 0.8424810767173767, "num_tokens": 35863592.0, "step": 942 }, { "epoch": 0.11995929271085104, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.814697265625e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.939685821533203, "learning_rate": 3.993217465027554e-07, "loss": 0.5859, "mean_token_accuracy": 0.8206856846809387, "num_tokens": 35904483.0, "step": 943 }, { "epoch": 0.12008650298944154, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.931375503540039, "learning_rate": 3.9974565493853323e-07, "loss": 0.4715, "mean_token_accuracy": 0.8514784574508667, "num_tokens": 35938662.0, "step": 944 }, { "epoch": 0.12021371326803205, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.794090270996094, "learning_rate": 4.0016956337431113e-07, "loss": 0.5366, "mean_token_accuracy": 0.8312833905220032, "num_tokens": 35975176.0, "step": 945 }, { "epoch": 0.12034092354662257, "ewc_loss": 0.01171875, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 13.269091606140137, "learning_rate": 4.0059347181008903e-07, "loss": 0.5227, "mean_token_accuracy": 0.8352881669998169, "num_tokens": 36008893.0, "step": 946 }, { "epoch": 0.12046813382521308, "ewc_loss": 0.01177978515625, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 12.684890747070312, "learning_rate": 4.010173802458669e-07, "loss": 0.4816, "mean_token_accuracy": 0.8502214550971985, "num_tokens": 36049882.0, "step": 947 }, { "epoch": 0.12059534410380358, "ewc_loss": 0.01177978515625, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.867813110351562e-06, "grad_norm": 12.992188453674316, "learning_rate": 4.014412886816447e-07, "loss": 0.5959, "mean_token_accuracy": 0.8173194527626038, "num_tokens": 36090615.0, "step": 948 }, { "epoch": 0.1207225543823941, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 12.890888214111328, "learning_rate": 4.018651971174226e-07, "loss": 0.5259, "mean_token_accuracy": 0.8326645493507385, "num_tokens": 36128159.0, "step": 949 }, { "epoch": 0.12084976466098461, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 12.835762977600098, "learning_rate": 4.022891055532005e-07, "loss": 0.5384, "mean_token_accuracy": 0.8306065201759338, "num_tokens": 36172851.0, "step": 950 }, { "epoch": 0.12097697493957511, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 13.017278671264648, "learning_rate": 4.0271301398897837e-07, "loss": 0.59, "mean_token_accuracy": 0.8201778531074524, "num_tokens": 36214282.0, "step": 951 }, { "epoch": 0.12110418521816563, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 13.010309219360352, "learning_rate": 4.031369224247562e-07, "loss": 0.5109, "mean_token_accuracy": 0.8350275754928589, "num_tokens": 36248619.0, "step": 952 }, { "epoch": 0.12123139549675614, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 12.758271217346191, "learning_rate": 4.035608308605341e-07, "loss": 0.5481, "mean_token_accuracy": 0.8270847797393799, "num_tokens": 36289070.0, "step": 953 }, { "epoch": 0.12135860577534664, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 12.96235466003418, "learning_rate": 4.03984739296312e-07, "loss": 0.5468, "mean_token_accuracy": 0.8304659128189087, "num_tokens": 36330883.0, "step": 954 }, { "epoch": 0.12148581605393716, "ewc_loss": 0.011962890625, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 7.987022399902344e-06, "grad_norm": 13.052408218383789, "learning_rate": 4.044086477320898e-07, "loss": 0.5266, "mean_token_accuracy": 0.834330677986145, "num_tokens": 36371507.0, "step": 955 }, { "epoch": 0.12161302633252767, "ewc_loss": 0.01190185546875, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.987022399902344e-06, "grad_norm": 12.785602569580078, "learning_rate": 4.048325561678677e-07, "loss": 0.55, "mean_token_accuracy": 0.8271427154541016, "num_tokens": 36407262.0, "step": 956 }, { "epoch": 0.12174023661111817, "ewc_loss": 0.0118408203125, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 7.927417755126953e-06, "grad_norm": 12.80382251739502, "learning_rate": 4.052564646036456e-07, "loss": 0.5451, "mean_token_accuracy": 0.8297480344772339, "num_tokens": 36448089.0, "step": 957 }, { "epoch": 0.12186744688970869, "ewc_loss": 0.011962890625, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 8.046627044677734e-06, "grad_norm": 12.973552703857422, "learning_rate": 4.056803730394235e-07, "loss": 0.5592, "mean_token_accuracy": 0.8217591047286987, "num_tokens": 36484022.0, "step": 958 }, { "epoch": 0.1219946571682992, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 12.908764839172363, "learning_rate": 4.061042814752013e-07, "loss": 0.5346, "mean_token_accuracy": 0.8334211111068726, "num_tokens": 36519385.0, "step": 959 }, { "epoch": 0.12212186744688971, "ewc_loss": 0.011962890625, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.046627044677734e-06, "grad_norm": 12.929858207702637, "learning_rate": 4.065281899109792e-07, "loss": 0.5379, "mean_token_accuracy": 0.8299497365951538, "num_tokens": 36551943.0, "step": 960 }, { "epoch": 0.12224907772548022, "ewc_loss": 0.011962890625, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 7.987022399902344e-06, "grad_norm": 13.095626831054688, "learning_rate": 4.069520983467571e-07, "loss": 0.4833, "mean_token_accuracy": 0.8475015759468079, "num_tokens": 36590927.0, "step": 961 }, { "epoch": 0.12237628800407073, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 13.018567085266113, "learning_rate": 4.07376006782535e-07, "loss": 0.5183, "mean_token_accuracy": 0.8355298042297363, "num_tokens": 36625199.0, "step": 962 }, { "epoch": 0.12250349828266124, "ewc_loss": 0.011962890625, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.046627044677734e-06, "grad_norm": 13.009511947631836, "learning_rate": 4.077999152183128e-07, "loss": 0.4788, "mean_token_accuracy": 0.847293496131897, "num_tokens": 36658981.0, "step": 963 }, { "epoch": 0.12263070856125174, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 12.835295677185059, "learning_rate": 4.082238236540907e-07, "loss": 0.5392, "mean_token_accuracy": 0.8340455293655396, "num_tokens": 36704214.0, "step": 964 }, { "epoch": 0.12275791883984226, "ewc_loss": 0.01220703125, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.225440979003906e-06, "grad_norm": 13.12817668914795, "learning_rate": 4.086477320898686e-07, "loss": 0.5083, "mean_token_accuracy": 0.8388471007347107, "num_tokens": 36744782.0, "step": 965 }, { "epoch": 0.12288512911843277, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 13.131216049194336, "learning_rate": 4.090716405256465e-07, "loss": 0.5444, "mean_token_accuracy": 0.827117919921875, "num_tokens": 36783645.0, "step": 966 }, { "epoch": 0.12301233939702327, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 13.102487564086914, "learning_rate": 4.094955489614243e-07, "loss": 0.5124, "mean_token_accuracy": 0.8377892971038818, "num_tokens": 36817539.0, "step": 967 }, { "epoch": 0.12313954967561379, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 13.055144309997559, "learning_rate": 4.099194573972022e-07, "loss": 0.5134, "mean_token_accuracy": 0.8380559682846069, "num_tokens": 36859144.0, "step": 968 }, { "epoch": 0.1232667599542043, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 13.147160530090332, "learning_rate": 4.1034336583298007e-07, "loss": 0.5559, "mean_token_accuracy": 0.8257086873054504, "num_tokens": 36900833.0, "step": 969 }, { "epoch": 0.1233939702327948, "ewc_loss": 0.01220703125, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.225440979003906e-06, "grad_norm": 12.977739334106445, "learning_rate": 4.1076727426875797e-07, "loss": 0.5465, "mean_token_accuracy": 0.8285338878631592, "num_tokens": 36940935.0, "step": 970 }, { "epoch": 0.12352118051138532, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.106231689453125e-06, "grad_norm": 13.225570678710938, "learning_rate": 4.1119118270453577e-07, "loss": 0.6088, "mean_token_accuracy": 0.8091160655021667, "num_tokens": 36979874.0, "step": 971 }, { "epoch": 0.12364839078997583, "ewc_loss": 0.01220703125, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 8.285045623779297e-06, "grad_norm": 13.210860252380371, "learning_rate": 4.1161509114031366e-07, "loss": 0.4893, "mean_token_accuracy": 0.8468879461288452, "num_tokens": 37017917.0, "step": 972 }, { "epoch": 0.12377560106856635, "ewc_loss": 0.01226806640625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.285045623779297e-06, "grad_norm": 13.091877937316895, "learning_rate": 4.1203899957609156e-07, "loss": 0.5536, "mean_token_accuracy": 0.8258739113807678, "num_tokens": 37054914.0, "step": 973 }, { "epoch": 0.12390281134715685, "ewc_loss": 0.0120849609375, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.106231689453125e-06, "grad_norm": 13.143611907958984, "learning_rate": 4.124629080118694e-07, "loss": 0.5038, "mean_token_accuracy": 0.8384486436843872, "num_tokens": 37088277.0, "step": 974 }, { "epoch": 0.12403002162574736, "ewc_loss": 0.01220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.225440979003906e-06, "grad_norm": 13.015595436096191, "learning_rate": 4.1288681644764726e-07, "loss": 0.5511, "mean_token_accuracy": 0.8257683515548706, "num_tokens": 37126963.0, "step": 975 }, { "epoch": 0.12415723190433788, "ewc_loss": 0.01226806640625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.285045623779297e-06, "grad_norm": 13.187284469604492, "learning_rate": 4.1331072488342515e-07, "loss": 0.5646, "mean_token_accuracy": 0.8246421217918396, "num_tokens": 37161948.0, "step": 976 }, { "epoch": 0.12428444218292838, "ewc_loss": 0.01214599609375, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.165836334228516e-06, "grad_norm": 13.083462715148926, "learning_rate": 4.1373463331920305e-07, "loss": 0.4771, "mean_token_accuracy": 0.8498995304107666, "num_tokens": 37198552.0, "step": 977 }, { "epoch": 0.12441165246151889, "ewc_loss": 0.01226806640625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.285045623779297e-06, "grad_norm": 13.32943344116211, "learning_rate": 4.141585417549809e-07, "loss": 0.5489, "mean_token_accuracy": 0.8258777260780334, "num_tokens": 37235990.0, "step": 978 }, { "epoch": 0.1245388627401094, "ewc_loss": 0.0123291015625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.344650268554688e-06, "grad_norm": 13.16710090637207, "learning_rate": 4.1458245019075875e-07, "loss": 0.5075, "mean_token_accuracy": 0.838835597038269, "num_tokens": 37269664.0, "step": 979 }, { "epoch": 0.1246660730186999, "ewc_loss": 0.01220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.225440979003906e-06, "grad_norm": 13.209136009216309, "learning_rate": 4.1500635862653664e-07, "loss": 0.5387, "mean_token_accuracy": 0.8323867917060852, "num_tokens": 37308647.0, "step": 980 }, { "epoch": 0.12479328329729042, "ewc_loss": 0.01226806640625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.285045623779297e-06, "grad_norm": 13.113018035888672, "learning_rate": 4.1543026706231454e-07, "loss": 0.4881, "mean_token_accuracy": 0.8466149568557739, "num_tokens": 37345953.0, "step": 981 }, { "epoch": 0.12492049357588093, "ewc_loss": 0.01220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.225440979003906e-06, "grad_norm": 13.254107475280762, "learning_rate": 4.158541754980924e-07, "loss": 0.484, "mean_token_accuracy": 0.8494855761528015, "num_tokens": 37382735.0, "step": 982 }, { "epoch": 0.12504770385447145, "ewc_loss": 0.0123291015625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.344650268554688e-06, "grad_norm": 13.12292194366455, "learning_rate": 4.1627808393387024e-07, "loss": 0.5509, "mean_token_accuracy": 0.8251611590385437, "num_tokens": 37419605.0, "step": 983 }, { "epoch": 0.12517491413306195, "ewc_loss": 0.0123291015625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.344650268554688e-06, "grad_norm": 13.124815940856934, "learning_rate": 4.1670199236964813e-07, "loss": 0.5439, "mean_token_accuracy": 0.8244583606719971, "num_tokens": 37455395.0, "step": 984 }, { "epoch": 0.12530212441165245, "ewc_loss": 0.0123291015625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.344650268554688e-06, "grad_norm": 13.199508666992188, "learning_rate": 4.1712590080542603e-07, "loss": 0.4944, "mean_token_accuracy": 0.8441084623336792, "num_tokens": 37492208.0, "step": 985 }, { "epoch": 0.12542933469024298, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.201693534851074, "learning_rate": 4.175498092412039e-07, "loss": 0.5412, "mean_token_accuracy": 0.8333337306976318, "num_tokens": 37533675.0, "step": 986 }, { "epoch": 0.12555654496883348, "ewc_loss": 0.0123291015625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.344650268554688e-06, "grad_norm": 13.190503120422363, "learning_rate": 4.179737176769817e-07, "loss": 0.5067, "mean_token_accuracy": 0.8412196636199951, "num_tokens": 37572792.0, "step": 987 }, { "epoch": 0.12568375524742398, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.150529861450195, "learning_rate": 4.183976261127596e-07, "loss": 0.5024, "mean_token_accuracy": 0.840084433555603, "num_tokens": 37611481.0, "step": 988 }, { "epoch": 0.1258109655260145, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.27706527709961, "learning_rate": 4.1882153454853747e-07, "loss": 0.5811, "mean_token_accuracy": 0.8182045221328735, "num_tokens": 37648253.0, "step": 989 }, { "epoch": 0.125938175804605, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.033914566040039, "learning_rate": 4.1924544298431537e-07, "loss": 0.5133, "mean_token_accuracy": 0.8395153284072876, "num_tokens": 37687086.0, "step": 990 }, { "epoch": 0.12606538608319554, "ewc_loss": 0.0123291015625, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.344650268554688e-06, "grad_norm": 13.16437816619873, "learning_rate": 4.196693514200932e-07, "loss": 0.5079, "mean_token_accuracy": 0.8406796455383301, "num_tokens": 37730928.0, "step": 991 }, { "epoch": 0.12619259636178604, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.17085075378418, "learning_rate": 4.200932598558711e-07, "loss": 0.5336, "mean_token_accuracy": 0.8288521766662598, "num_tokens": 37771857.0, "step": 992 }, { "epoch": 0.12631980664037654, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.2720308303833, "learning_rate": 4.2051716829164896e-07, "loss": 0.5364, "mean_token_accuracy": 0.8336935043334961, "num_tokens": 37817517.0, "step": 993 }, { "epoch": 0.12644701691896706, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.226765632629395, "learning_rate": 4.2094107672742686e-07, "loss": 0.5792, "mean_token_accuracy": 0.8229519128799438, "num_tokens": 37855891.0, "step": 994 }, { "epoch": 0.12657422719755757, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.106306076049805, "learning_rate": 4.2136498516320476e-07, "loss": 0.5013, "mean_token_accuracy": 0.8440769910812378, "num_tokens": 37888719.0, "step": 995 }, { "epoch": 0.12670143747614807, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.161308288574219, "learning_rate": 4.217888935989826e-07, "loss": 0.5511, "mean_token_accuracy": 0.827609658241272, "num_tokens": 37932968.0, "step": 996 }, { "epoch": 0.1268286477547386, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.31140422821045, "learning_rate": 4.2221280203476045e-07, "loss": 0.5418, "mean_token_accuracy": 0.828741729259491, "num_tokens": 37974297.0, "step": 997 }, { "epoch": 0.1269558580333291, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.260987281799316, "learning_rate": 4.2263671047053835e-07, "loss": 0.56, "mean_token_accuracy": 0.8245208263397217, "num_tokens": 38020294.0, "step": 998 }, { "epoch": 0.1270830683119196, "ewc_loss": 0.01239013671875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.404254913330078e-06, "grad_norm": 13.125192642211914, "learning_rate": 4.2306061890631625e-07, "loss": 0.5391, "mean_token_accuracy": 0.8320172429084778, "num_tokens": 38063322.0, "step": 999 }, { "epoch": 0.12721027859051012, "ewc_loss": 0.01251220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.52346420288086e-06, "grad_norm": 13.310461044311523, "learning_rate": 4.234845273420941e-07, "loss": 0.6293, "mean_token_accuracy": 0.8111281394958496, "num_tokens": 38101052.0, "step": 1000 }, { "epoch": 0.12733748886910062, "ewc_loss": 0.0125732421875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.58306884765625e-06, "grad_norm": 13.05538558959961, "learning_rate": 4.2390843577787194e-07, "loss": 0.5473, "mean_token_accuracy": 0.827828049659729, "num_tokens": 38145472.0, "step": 1001 }, { "epoch": 0.12746469914769112, "ewc_loss": 0.01251220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.52346420288086e-06, "grad_norm": 13.603178977966309, "learning_rate": 4.2433234421364984e-07, "loss": 0.5235, "mean_token_accuracy": 0.8354980945587158, "num_tokens": 38182662.0, "step": 1002 }, { "epoch": 0.12759190942628165, "ewc_loss": 0.01263427734375, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.521053314208984, "learning_rate": 4.2475625264942774e-07, "loss": 0.5099, "mean_token_accuracy": 0.8409661054611206, "num_tokens": 38224735.0, "step": 1003 }, { "epoch": 0.12771911970487215, "ewc_loss": 0.01251220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.52346420288086e-06, "grad_norm": 13.346314430236816, "learning_rate": 4.251801610852056e-07, "loss": 0.554, "mean_token_accuracy": 0.8263649344444275, "num_tokens": 38255847.0, "step": 1004 }, { "epoch": 0.12784632998346265, "ewc_loss": 0.01251220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.52346420288086e-06, "grad_norm": 13.300437927246094, "learning_rate": 4.2560406952098343e-07, "loss": 0.5677, "mean_token_accuracy": 0.8238311409950256, "num_tokens": 38296921.0, "step": 1005 }, { "epoch": 0.12797354026205318, "ewc_loss": 0.01251220703125, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.52346420288086e-06, "grad_norm": 13.075145721435547, "learning_rate": 4.2602797795676133e-07, "loss": 0.5765, "mean_token_accuracy": 0.8208857774734497, "num_tokens": 38336408.0, "step": 1006 }, { "epoch": 0.12810075054064368, "ewc_loss": 0.0125732421875, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.58306884765625e-06, "grad_norm": 13.62386703491211, "learning_rate": 4.2645188639253923e-07, "loss": 0.484, "mean_token_accuracy": 0.8456598520278931, "num_tokens": 38372809.0, "step": 1007 }, { "epoch": 0.12822796081923418, "ewc_loss": 0.01263427734375, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.517156600952148, "learning_rate": 4.26875794828317e-07, "loss": 0.5438, "mean_token_accuracy": 0.8288984298706055, "num_tokens": 38413312.0, "step": 1008 }, { "epoch": 0.1283551710978247, "ewc_loss": 0.01263427734375, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.328304290771484, "learning_rate": 4.272997032640949e-07, "loss": 0.5667, "mean_token_accuracy": 0.8216148614883423, "num_tokens": 38457710.0, "step": 1009 }, { "epoch": 0.1284823813764152, "ewc_loss": 0.0125732421875, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 8.52346420288086e-06, "grad_norm": 13.388916969299316, "learning_rate": 4.277236116998728e-07, "loss": 0.5255, "mean_token_accuracy": 0.836185097694397, "num_tokens": 38499630.0, "step": 1010 }, { "epoch": 0.1286095916550057, "ewc_loss": 0.0126953125, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.338030815124512, "learning_rate": 4.281475201356507e-07, "loss": 0.5332, "mean_token_accuracy": 0.8338416814804077, "num_tokens": 38538133.0, "step": 1011 }, { "epoch": 0.12873680193359624, "ewc_loss": 0.0128173828125, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 8.702278137207031e-06, "grad_norm": 13.544771194458008, "learning_rate": 4.285714285714285e-07, "loss": 0.567, "mean_token_accuracy": 0.8240352272987366, "num_tokens": 38576816.0, "step": 1012 }, { "epoch": 0.12886401221218674, "ewc_loss": 0.0128173828125, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 8.702278137207031e-06, "grad_norm": 13.306028366088867, "learning_rate": 4.289953370072064e-07, "loss": 0.5039, "mean_token_accuracy": 0.8417419791221619, "num_tokens": 38615270.0, "step": 1013 }, { "epoch": 0.12899122249077727, "ewc_loss": 0.01263427734375, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 8.58306884765625e-06, "grad_norm": 13.402188301086426, "learning_rate": 4.294192454429843e-07, "loss": 0.5566, "mean_token_accuracy": 0.8253488540649414, "num_tokens": 38654173.0, "step": 1014 }, { "epoch": 0.12911843276936777, "ewc_loss": 0.01275634765625, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 8.702278137207031e-06, "grad_norm": 13.372152328491211, "learning_rate": 4.298431538787622e-07, "loss": 0.4415, "mean_token_accuracy": 0.8619498014450073, "num_tokens": 38695758.0, "step": 1015 }, { "epoch": 0.12924564304795827, "ewc_loss": 0.0126953125, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.362354278564453, "learning_rate": 4.3026706231454e-07, "loss": 0.5879, "mean_token_accuracy": 0.8159055709838867, "num_tokens": 38729524.0, "step": 1016 }, { "epoch": 0.1293728533265488, "ewc_loss": 0.0126953125, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.41901683807373, "learning_rate": 4.306909707503179e-07, "loss": 0.5353, "mean_token_accuracy": 0.8353909254074097, "num_tokens": 38767500.0, "step": 1017 }, { "epoch": 0.1295000636051393, "ewc_loss": 0.01275634765625, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.548014640808105, "learning_rate": 4.311148791860958e-07, "loss": 0.5227, "mean_token_accuracy": 0.835126519203186, "num_tokens": 38798683.0, "step": 1018 }, { "epoch": 0.1296272738837298, "ewc_loss": 0.01275634765625, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.408041000366211, "learning_rate": 4.315387876218737e-07, "loss": 0.5722, "mean_token_accuracy": 0.8224744200706482, "num_tokens": 38833653.0, "step": 1019 }, { "epoch": 0.12975448416232033, "ewc_loss": 0.01275634765625, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.64267349243164e-06, "grad_norm": 13.454768180847168, "learning_rate": 4.319626960576515e-07, "loss": 0.5692, "mean_token_accuracy": 0.8195030689239502, "num_tokens": 38869428.0, "step": 1020 }, { "epoch": 0.12988169444091083, "ewc_loss": 0.0128173828125, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.702278137207031e-06, "grad_norm": 13.213431358337402, "learning_rate": 4.323866044934294e-07, "loss": 0.5051, "mean_token_accuracy": 0.8373006582260132, "num_tokens": 38909502.0, "step": 1021 }, { "epoch": 0.13000890471950133, "ewc_loss": 0.0128173828125, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.702278137207031e-06, "grad_norm": 13.471084594726562, "learning_rate": 4.328105129292073e-07, "loss": 0.5308, "mean_token_accuracy": 0.8348225355148315, "num_tokens": 38951730.0, "step": 1022 }, { "epoch": 0.13013611499809186, "ewc_loss": 0.01287841796875, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.519461631774902, "learning_rate": 4.332344213649852e-07, "loss": 0.5086, "mean_token_accuracy": 0.8366721272468567, "num_tokens": 38988981.0, "step": 1023 }, { "epoch": 0.13026332527668236, "ewc_loss": 0.01287841796875, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.5648775100708, "learning_rate": 4.33658329800763e-07, "loss": 0.5161, "mean_token_accuracy": 0.839350163936615, "num_tokens": 39026289.0, "step": 1024 }, { "epoch": 0.13039053555527286, "ewc_loss": 0.01287841796875, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.525774955749512, "learning_rate": 4.340822382365409e-07, "loss": 0.5074, "mean_token_accuracy": 0.837617039680481, "num_tokens": 39069113.0, "step": 1025 }, { "epoch": 0.13051774583386339, "ewc_loss": 0.012939453125, "ewc_loss_diag": 4.112720489501953e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.589659690856934, "learning_rate": 4.345061466723188e-07, "loss": 0.4913, "mean_token_accuracy": 0.8450495004653931, "num_tokens": 39112991.0, "step": 1026 }, { "epoch": 0.13064495611245389, "ewc_loss": 0.012939453125, "ewc_loss_diag": 4.112720489501953e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.5366792678833, "learning_rate": 4.3493005510809663e-07, "loss": 0.5544, "mean_token_accuracy": 0.827255129814148, "num_tokens": 39149593.0, "step": 1027 }, { "epoch": 0.1307721663910444, "ewc_loss": 0.012939453125, "ewc_loss_diag": 4.112720489501953e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.33548641204834, "learning_rate": 4.353539635438745e-07, "loss": 0.5172, "mean_token_accuracy": 0.8376250267028809, "num_tokens": 39186570.0, "step": 1028 }, { "epoch": 0.13089937666963491, "ewc_loss": 0.012939453125, "ewc_loss_diag": 4.1425228118896484e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.813578605651855, "learning_rate": 4.357778719796524e-07, "loss": 0.5807, "mean_token_accuracy": 0.8162316679954529, "num_tokens": 39220666.0, "step": 1029 }, { "epoch": 0.13102658694822542, "ewc_loss": 0.012939453125, "ewc_loss_diag": 4.1425228118896484e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.245749473571777, "learning_rate": 4.362017804154303e-07, "loss": 0.5001, "mean_token_accuracy": 0.8440081477165222, "num_tokens": 39258372.0, "step": 1030 }, { "epoch": 0.13115379722681592, "ewc_loss": 0.01287841796875, "ewc_loss_diag": 4.1425228118896484e-06, "ewc_loss_parallel": 8.702278137207031e-06, "grad_norm": 13.456494331359863, "learning_rate": 4.366256888512081e-07, "loss": 0.4947, "mean_token_accuracy": 0.8449170589447021, "num_tokens": 39293647.0, "step": 1031 }, { "epoch": 0.13128100750540644, "ewc_loss": 0.01300048828125, "ewc_loss_diag": 4.1425228118896484e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.636706352233887, "learning_rate": 4.3704959728698597e-07, "loss": 0.5237, "mean_token_accuracy": 0.8334818482398987, "num_tokens": 39332811.0, "step": 1032 }, { "epoch": 0.13140821778399694, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.492008209228516, "learning_rate": 4.3747350572276386e-07, "loss": 0.5254, "mean_token_accuracy": 0.833219051361084, "num_tokens": 39365971.0, "step": 1033 }, { "epoch": 0.13153542806258745, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.580484390258789, "learning_rate": 4.3789741415854176e-07, "loss": 0.5396, "mean_token_accuracy": 0.8319815397262573, "num_tokens": 39403178.0, "step": 1034 }, { "epoch": 0.13166263834117797, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.548908233642578, "learning_rate": 4.383213225943196e-07, "loss": 0.5283, "mean_token_accuracy": 0.8322332501411438, "num_tokens": 39441303.0, "step": 1035 }, { "epoch": 0.13178984861976847, "ewc_loss": 0.012939453125, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.761882781982422e-06, "grad_norm": 13.607181549072266, "learning_rate": 4.3874523103009746e-07, "loss": 0.4744, "mean_token_accuracy": 0.8507440090179443, "num_tokens": 39477359.0, "step": 1036 }, { "epoch": 0.13191705889835897, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 13.655893325805664, "learning_rate": 4.3916913946587536e-07, "loss": 0.4832, "mean_token_accuracy": 0.8443608283996582, "num_tokens": 39511687.0, "step": 1037 }, { "epoch": 0.1320442691769495, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.464279174804688, "learning_rate": 4.3959304790165325e-07, "loss": 0.4992, "mean_token_accuracy": 0.8407176733016968, "num_tokens": 39547453.0, "step": 1038 }, { "epoch": 0.13217147945554, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.709733963012695, "learning_rate": 4.400169563374311e-07, "loss": 0.5156, "mean_token_accuracy": 0.8359577655792236, "num_tokens": 39588934.0, "step": 1039 }, { "epoch": 0.13229868973413053, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.660186767578125, "learning_rate": 4.4044086477320895e-07, "loss": 0.6091, "mean_token_accuracy": 0.814638078212738, "num_tokens": 39627638.0, "step": 1040 }, { "epoch": 0.13242590001272103, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.760933876037598, "learning_rate": 4.4086477320898685e-07, "loss": 0.5158, "mean_token_accuracy": 0.8362526893615723, "num_tokens": 39661696.0, "step": 1041 }, { "epoch": 0.13255311029131153, "ewc_loss": 0.01318359375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 9.000301361083984e-06, "grad_norm": 13.806381225585938, "learning_rate": 4.4128868164476474e-07, "loss": 0.5242, "mean_token_accuracy": 0.8389006853103638, "num_tokens": 39702357.0, "step": 1042 }, { "epoch": 0.13268032056990206, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 13.439360618591309, "learning_rate": 4.417125900805426e-07, "loss": 0.5076, "mean_token_accuracy": 0.8411768078804016, "num_tokens": 39744960.0, "step": 1043 }, { "epoch": 0.13280753084849256, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.75659465789795, "learning_rate": 4.4213649851632044e-07, "loss": 0.5669, "mean_token_accuracy": 0.8257632851600647, "num_tokens": 39785394.0, "step": 1044 }, { "epoch": 0.13293474112708306, "ewc_loss": 0.01312255859375, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 13.720730781555176, "learning_rate": 4.4256040695209834e-07, "loss": 0.5472, "mean_token_accuracy": 0.831902027130127, "num_tokens": 39822698.0, "step": 1045 }, { "epoch": 0.1330619514056736, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 14.109107971191406, "learning_rate": 4.429843153878762e-07, "loss": 0.5465, "mean_token_accuracy": 0.8269997835159302, "num_tokens": 39856874.0, "step": 1046 }, { "epoch": 0.1331891616842641, "ewc_loss": 0.01312255859375, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 8.940696716308594e-06, "grad_norm": 14.061518669128418, "learning_rate": 4.434082238236541e-07, "loss": 0.5782, "mean_token_accuracy": 0.8183463215827942, "num_tokens": 39899683.0, "step": 1047 }, { "epoch": 0.1333163719628546, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 13.99772834777832, "learning_rate": 4.4383213225943193e-07, "loss": 0.5557, "mean_token_accuracy": 0.8277170658111572, "num_tokens": 39940565.0, "step": 1048 }, { "epoch": 0.13344358224144512, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 13.784985542297363, "learning_rate": 4.442560406952098e-07, "loss": 0.555, "mean_token_accuracy": 0.8238385915756226, "num_tokens": 39979010.0, "step": 1049 }, { "epoch": 0.13357079252003562, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.95645523071289, "learning_rate": 4.4467994913098767e-07, "loss": 0.5041, "mean_token_accuracy": 0.8416985273361206, "num_tokens": 40021151.0, "step": 1050 }, { "epoch": 0.13369800279862612, "ewc_loss": 0.01312255859375, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 8.940696716308594e-06, "grad_norm": 14.043346405029297, "learning_rate": 4.4510385756676557e-07, "loss": 0.5639, "mean_token_accuracy": 0.8228704929351807, "num_tokens": 40059324.0, "step": 1051 }, { "epoch": 0.13382521307721665, "ewc_loss": 0.01312255859375, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 8.881092071533203e-06, "grad_norm": 13.640374183654785, "learning_rate": 4.455277660025434e-07, "loss": 0.519, "mean_token_accuracy": 0.8353607058525085, "num_tokens": 40104577.0, "step": 1052 }, { "epoch": 0.13395242335580715, "ewc_loss": 0.0130615234375, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 8.821487426757812e-06, "grad_norm": 13.988293647766113, "learning_rate": 4.459516744383213e-07, "loss": 0.5434, "mean_token_accuracy": 0.8267378807067871, "num_tokens": 40139734.0, "step": 1053 }, { "epoch": 0.13407963363439765, "ewc_loss": 0.01312255859375, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 8.940696716308594e-06, "grad_norm": 13.944300651550293, "learning_rate": 4.4637558287409916e-07, "loss": 0.5396, "mean_token_accuracy": 0.829367458820343, "num_tokens": 40180724.0, "step": 1054 }, { "epoch": 0.13420684391298818, "ewc_loss": 0.01318359375, "ewc_loss_diag": 4.231929779052734e-06, "ewc_loss_parallel": 8.940696716308594e-06, "grad_norm": 13.679781913757324, "learning_rate": 4.4679949130987706e-07, "loss": 0.5377, "mean_token_accuracy": 0.8365930318832397, "num_tokens": 40220475.0, "step": 1055 }, { "epoch": 0.13433405419157868, "ewc_loss": 0.01318359375, "ewc_loss_diag": 4.231929779052734e-06, "ewc_loss_parallel": 9.000301361083984e-06, "grad_norm": 14.014174461364746, "learning_rate": 4.472233997456549e-07, "loss": 0.55, "mean_token_accuracy": 0.8272891044616699, "num_tokens": 40258208.0, "step": 1056 }, { "epoch": 0.13446126447016918, "ewc_loss": 0.01324462890625, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.000301361083984e-06, "grad_norm": 13.837420463562012, "learning_rate": 4.476473081814328e-07, "loss": 0.5328, "mean_token_accuracy": 0.832004725933075, "num_tokens": 40299127.0, "step": 1057 }, { "epoch": 0.1345884747487597, "ewc_loss": 0.01318359375, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 9.000301361083984e-06, "grad_norm": 13.996908187866211, "learning_rate": 4.4807121661721065e-07, "loss": 0.4912, "mean_token_accuracy": 0.8429940342903137, "num_tokens": 40339331.0, "step": 1058 }, { "epoch": 0.1347156850273502, "ewc_loss": 0.01324462890625, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.000301361083984e-06, "grad_norm": 13.877835273742676, "learning_rate": 4.4849512505298855e-07, "loss": 0.5356, "mean_token_accuracy": 0.8306616544723511, "num_tokens": 40379386.0, "step": 1059 }, { "epoch": 0.1348428953059407, "ewc_loss": 0.0133056640625, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.059906005859375e-06, "grad_norm": 14.145063400268555, "learning_rate": 4.489190334887664e-07, "loss": 0.5745, "mean_token_accuracy": 0.8266246318817139, "num_tokens": 40419868.0, "step": 1060 }, { "epoch": 0.13497010558453124, "ewc_loss": 0.01336669921875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.119510650634766e-06, "grad_norm": 13.760791778564453, "learning_rate": 4.493429419245443e-07, "loss": 0.5419, "mean_token_accuracy": 0.8319472670555115, "num_tokens": 40455346.0, "step": 1061 }, { "epoch": 0.13509731586312174, "ewc_loss": 0.01324462890625, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.000301361083984e-06, "grad_norm": 14.054133415222168, "learning_rate": 4.4976685036032214e-07, "loss": 0.5004, "mean_token_accuracy": 0.8409319519996643, "num_tokens": 40490087.0, "step": 1062 }, { "epoch": 0.13522452614171224, "ewc_loss": 0.01336669921875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.119510650634766e-06, "grad_norm": 14.09246826171875, "learning_rate": 4.5019075879610004e-07, "loss": 0.5582, "mean_token_accuracy": 0.8214964866638184, "num_tokens": 40524799.0, "step": 1063 }, { "epoch": 0.13535173642030277, "ewc_loss": 0.01336669921875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.119510650634766e-06, "grad_norm": 13.913860321044922, "learning_rate": 4.506146672318779e-07, "loss": 0.5321, "mean_token_accuracy": 0.8325191736221313, "num_tokens": 40563740.0, "step": 1064 }, { "epoch": 0.13547894669889327, "ewc_loss": 0.01336669921875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.119510650634766e-06, "grad_norm": 14.146245956420898, "learning_rate": 4.5103857566765573e-07, "loss": 0.5851, "mean_token_accuracy": 0.8228479623794556, "num_tokens": 40604743.0, "step": 1065 }, { "epoch": 0.1356061569774838, "ewc_loss": 0.013427734375, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.179115295410156e-06, "grad_norm": 13.879386901855469, "learning_rate": 4.5146248410343363e-07, "loss": 0.4915, "mean_token_accuracy": 0.8456520438194275, "num_tokens": 40640909.0, "step": 1066 }, { "epoch": 0.1357333672560743, "ewc_loss": 0.01336669921875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.119510650634766e-06, "grad_norm": 14.268946647644043, "learning_rate": 4.5188639253921153e-07, "loss": 0.4984, "mean_token_accuracy": 0.8442409634590149, "num_tokens": 40678930.0, "step": 1067 }, { "epoch": 0.1358605775346648, "ewc_loss": 0.01348876953125, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.238719940185547e-06, "grad_norm": 13.649614334106445, "learning_rate": 4.523103009749894e-07, "loss": 0.4927, "mean_token_accuracy": 0.8439589142799377, "num_tokens": 40715609.0, "step": 1068 }, { "epoch": 0.13598778781325532, "ewc_loss": 0.01336669921875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.119510650634766e-06, "grad_norm": 14.29908561706543, "learning_rate": 4.527342094107672e-07, "loss": 0.5167, "mean_token_accuracy": 0.8360244035720825, "num_tokens": 40760533.0, "step": 1069 }, { "epoch": 0.13611499809184582, "ewc_loss": 0.0135498046875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.298324584960938e-06, "grad_norm": 13.93738842010498, "learning_rate": 4.531581178465451e-07, "loss": 0.544, "mean_token_accuracy": 0.8300901651382446, "num_tokens": 40800039.0, "step": 1070 }, { "epoch": 0.13624220837043632, "ewc_loss": 0.013427734375, "ewc_loss_diag": 4.291534423828125e-06, "ewc_loss_parallel": 9.179115295410156e-06, "grad_norm": 13.89501953125, "learning_rate": 4.53582026282323e-07, "loss": 0.5083, "mean_token_accuracy": 0.8401533365249634, "num_tokens": 40840207.0, "step": 1071 }, { "epoch": 0.13636941864902685, "ewc_loss": 0.0135498046875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.298324584960938e-06, "grad_norm": 14.014512062072754, "learning_rate": 4.5400593471810087e-07, "loss": 0.5902, "mean_token_accuracy": 0.8159552812576294, "num_tokens": 40879559.0, "step": 1072 }, { "epoch": 0.13649662892761735, "ewc_loss": 0.0135498046875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.298324584960938e-06, "grad_norm": 14.042389869689941, "learning_rate": 4.544298431538787e-07, "loss": 0.5176, "mean_token_accuracy": 0.8382166624069214, "num_tokens": 40919608.0, "step": 1073 }, { "epoch": 0.13662383920620785, "ewc_loss": 0.0135498046875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.298324584960938e-06, "grad_norm": 14.386383056640625, "learning_rate": 4.548537515896566e-07, "loss": 0.5786, "mean_token_accuracy": 0.8186221122741699, "num_tokens": 40953648.0, "step": 1074 }, { "epoch": 0.13675104948479838, "ewc_loss": 0.013671875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.417533874511719e-06, "grad_norm": 13.998343467712402, "learning_rate": 4.552776600254345e-07, "loss": 0.5498, "mean_token_accuracy": 0.8281778693199158, "num_tokens": 40989675.0, "step": 1075 }, { "epoch": 0.13687825976338888, "ewc_loss": 0.0135498046875, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 9.298324584960938e-06, "grad_norm": 14.078272819519043, "learning_rate": 4.5570156846121236e-07, "loss": 0.5002, "mean_token_accuracy": 0.841949462890625, "num_tokens": 41027992.0, "step": 1076 }, { "epoch": 0.13700547004197938, "ewc_loss": 0.0137939453125, "ewc_loss_diag": 4.291534423828125e-06, "ewc_loss_parallel": 9.47713851928711e-06, "grad_norm": 13.939103126525879, "learning_rate": 4.561254768969902e-07, "loss": 0.5004, "mean_token_accuracy": 0.8403940200805664, "num_tokens": 41066925.0, "step": 1077 }, { "epoch": 0.1371326803205699, "ewc_loss": 0.013671875, "ewc_loss_diag": 4.291534423828125e-06, "ewc_loss_parallel": 9.417533874511719e-06, "grad_norm": 14.065768241882324, "learning_rate": 4.565493853327681e-07, "loss": 0.5098, "mean_token_accuracy": 0.8407308459281921, "num_tokens": 41105090.0, "step": 1078 }, { "epoch": 0.1372598905991604, "ewc_loss": 0.013671875, "ewc_loss_diag": 4.291534423828125e-06, "ewc_loss_parallel": 9.417533874511719e-06, "grad_norm": 14.036941528320312, "learning_rate": 4.56973293768546e-07, "loss": 0.5511, "mean_token_accuracy": 0.8311270475387573, "num_tokens": 41141924.0, "step": 1079 }, { "epoch": 0.1373871008777509, "ewc_loss": 0.0137939453125, "ewc_loss_diag": 4.291534423828125e-06, "ewc_loss_parallel": 9.47713851928711e-06, "grad_norm": 14.062832832336426, "learning_rate": 4.573972022043238e-07, "loss": 0.6143, "mean_token_accuracy": 0.8085561394691467, "num_tokens": 41181512.0, "step": 1080 }, { "epoch": 0.13751431115634144, "ewc_loss": 0.0137939453125, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.47713851928711e-06, "grad_norm": 14.060259819030762, "learning_rate": 4.578211106401017e-07, "loss": 0.5616, "mean_token_accuracy": 0.8239794373512268, "num_tokens": 41224015.0, "step": 1081 }, { "epoch": 0.13764152143493194, "ewc_loss": 0.013916015625, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.59634780883789e-06, "grad_norm": 13.96065616607666, "learning_rate": 4.582450190758796e-07, "loss": 0.5632, "mean_token_accuracy": 0.8259084224700928, "num_tokens": 41265568.0, "step": 1082 }, { "epoch": 0.13776873171352244, "ewc_loss": 0.0137939453125, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.47713851928711e-06, "grad_norm": 13.856904983520508, "learning_rate": 4.586689275116575e-07, "loss": 0.4748, "mean_token_accuracy": 0.8505141735076904, "num_tokens": 41303595.0, "step": 1083 }, { "epoch": 0.13789594199211297, "ewc_loss": 0.01385498046875, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.5367431640625e-06, "grad_norm": 13.775609016418457, "learning_rate": 4.590928359474353e-07, "loss": 0.581, "mean_token_accuracy": 0.8212592601776123, "num_tokens": 41340119.0, "step": 1084 }, { "epoch": 0.13802315227070347, "ewc_loss": 0.01385498046875, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.5367431640625e-06, "grad_norm": 14.047802925109863, "learning_rate": 4.595167443832132e-07, "loss": 0.5315, "mean_token_accuracy": 0.8364169597625732, "num_tokens": 41380184.0, "step": 1085 }, { "epoch": 0.13815036254929397, "ewc_loss": 0.01385498046875, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.5367431640625e-06, "grad_norm": 13.85025691986084, "learning_rate": 4.599406528189911e-07, "loss": 0.4737, "mean_token_accuracy": 0.8492233157157898, "num_tokens": 41419038.0, "step": 1086 }, { "epoch": 0.1382775728278845, "ewc_loss": 0.01397705078125, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.655952453613281e-06, "grad_norm": 14.067602157592773, "learning_rate": 4.60364561254769e-07, "loss": 0.5191, "mean_token_accuracy": 0.8334627151489258, "num_tokens": 41455767.0, "step": 1087 }, { "epoch": 0.138404783106475, "ewc_loss": 0.01397705078125, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.655952453613281e-06, "grad_norm": 13.977843284606934, "learning_rate": 4.607884696905468e-07, "loss": 0.5152, "mean_token_accuracy": 0.8361966013908386, "num_tokens": 41497493.0, "step": 1088 }, { "epoch": 0.1385319933850655, "ewc_loss": 0.0140380859375, "ewc_loss_diag": 4.351139068603516e-06, "ewc_loss_parallel": 9.655952453613281e-06, "grad_norm": 14.045021057128906, "learning_rate": 4.612123781263247e-07, "loss": 0.5458, "mean_token_accuracy": 0.828667402267456, "num_tokens": 41542940.0, "step": 1089 }, { "epoch": 0.13865920366365603, "ewc_loss": 0.0140380859375, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 9.715557098388672e-06, "grad_norm": 14.044729232788086, "learning_rate": 4.616362865621026e-07, "loss": 0.4464, "mean_token_accuracy": 0.8550859689712524, "num_tokens": 41579302.0, "step": 1090 }, { "epoch": 0.13878641394224653, "ewc_loss": 0.0140380859375, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.655952453613281e-06, "grad_norm": 14.061552047729492, "learning_rate": 4.620601949978805e-07, "loss": 0.5463, "mean_token_accuracy": 0.8302582502365112, "num_tokens": 41617555.0, "step": 1091 }, { "epoch": 0.13891362422083706, "ewc_loss": 0.0140380859375, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.655952453613281e-06, "grad_norm": 13.930404663085938, "learning_rate": 4.6248410343365827e-07, "loss": 0.5956, "mean_token_accuracy": 0.81229168176651, "num_tokens": 41654660.0, "step": 1092 }, { "epoch": 0.13904083449942756, "ewc_loss": 0.0140380859375, "ewc_loss_diag": 4.351139068603516e-06, "ewc_loss_parallel": 9.715557098388672e-06, "grad_norm": 14.180981636047363, "learning_rate": 4.6290801186943617e-07, "loss": 0.5034, "mean_token_accuracy": 0.8412021398544312, "num_tokens": 41693773.0, "step": 1093 }, { "epoch": 0.13916804477801806, "ewc_loss": 0.0140380859375, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.655952453613281e-06, "grad_norm": 14.051992416381836, "learning_rate": 4.6333192030521407e-07, "loss": 0.5182, "mean_token_accuracy": 0.8407208919525146, "num_tokens": 41728864.0, "step": 1094 }, { "epoch": 0.13929525505660859, "ewc_loss": 0.01416015625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.775161743164062e-06, "grad_norm": 13.950199127197266, "learning_rate": 4.6375582874099196e-07, "loss": 0.4927, "mean_token_accuracy": 0.8443199396133423, "num_tokens": 41770596.0, "step": 1095 }, { "epoch": 0.1394224653351991, "ewc_loss": 0.01416015625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.775161743164062e-06, "grad_norm": 14.12194538116455, "learning_rate": 4.6417973717676976e-07, "loss": 0.5123, "mean_token_accuracy": 0.841890811920166, "num_tokens": 41806585.0, "step": 1096 }, { "epoch": 0.1395496756137896, "ewc_loss": 0.01416015625, "ewc_loss_diag": 4.351139068603516e-06, "ewc_loss_parallel": 9.775161743164062e-06, "grad_norm": 14.171680450439453, "learning_rate": 4.6460364561254766e-07, "loss": 0.5392, "mean_token_accuracy": 0.8299619555473328, "num_tokens": 41845488.0, "step": 1097 }, { "epoch": 0.13967688589238011, "ewc_loss": 0.01416015625, "ewc_loss_diag": 4.351139068603516e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 14.105278015136719, "learning_rate": 4.6502755404832556e-07, "loss": 0.5048, "mean_token_accuracy": 0.8394224643707275, "num_tokens": 41880674.0, "step": 1098 }, { "epoch": 0.13980409617097062, "ewc_loss": 0.01422119140625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 14.118724822998047, "learning_rate": 4.654514624841034e-07, "loss": 0.5194, "mean_token_accuracy": 0.8361563682556152, "num_tokens": 41913020.0, "step": 1099 }, { "epoch": 0.13993130644956112, "ewc_loss": 0.01416015625, "ewc_loss_diag": 4.351139068603516e-06, "ewc_loss_parallel": 9.775161743164062e-06, "grad_norm": 14.044062614440918, "learning_rate": 4.6587537091988125e-07, "loss": 0.5252, "mean_token_accuracy": 0.8367093801498413, "num_tokens": 41948483.0, "step": 1100 }, { "epoch": 0.14005851672815164, "ewc_loss": 0.01416015625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.775161743164062e-06, "grad_norm": 14.126307487487793, "learning_rate": 4.6629927935565915e-07, "loss": 0.4956, "mean_token_accuracy": 0.8444228768348694, "num_tokens": 41985659.0, "step": 1101 }, { "epoch": 0.14018572700674214, "ewc_loss": 0.01422119140625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 13.991446495056152, "learning_rate": 4.6672318779143705e-07, "loss": 0.5948, "mean_token_accuracy": 0.8158910274505615, "num_tokens": 42027746.0, "step": 1102 }, { "epoch": 0.14031293728533265, "ewc_loss": 0.01422119140625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 14.16645622253418, "learning_rate": 4.671470962272149e-07, "loss": 0.5508, "mean_token_accuracy": 0.825783908367157, "num_tokens": 42060903.0, "step": 1103 }, { "epoch": 0.14044014756392317, "ewc_loss": 0.01422119140625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 13.949676513671875, "learning_rate": 4.6757100466299274e-07, "loss": 0.5448, "mean_token_accuracy": 0.8322815895080566, "num_tokens": 42102722.0, "step": 1104 }, { "epoch": 0.14056735784251367, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.509332656860352, "learning_rate": 4.6799491309877064e-07, "loss": 0.5542, "mean_token_accuracy": 0.8247443437576294, "num_tokens": 42143418.0, "step": 1105 }, { "epoch": 0.14069456812110417, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.329431533813477, "learning_rate": 4.6841882153454854e-07, "loss": 0.4936, "mean_token_accuracy": 0.8418985605239868, "num_tokens": 42177313.0, "step": 1106 }, { "epoch": 0.1408217783996947, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 14.136672019958496, "learning_rate": 4.688427299703264e-07, "loss": 0.4718, "mean_token_accuracy": 0.8477938175201416, "num_tokens": 42214158.0, "step": 1107 }, { "epoch": 0.1409489886782852, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.183865547180176, "learning_rate": 4.6926663840610423e-07, "loss": 0.5316, "mean_token_accuracy": 0.8337136507034302, "num_tokens": 42254189.0, "step": 1108 }, { "epoch": 0.1410761989568757, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.37426471710205, "learning_rate": 4.6969054684188213e-07, "loss": 0.554, "mean_token_accuracy": 0.8238476514816284, "num_tokens": 42296853.0, "step": 1109 }, { "epoch": 0.14120340923546623, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.271623611450195, "learning_rate": 4.7011445527766003e-07, "loss": 0.5244, "mean_token_accuracy": 0.831980288028717, "num_tokens": 42331983.0, "step": 1110 }, { "epoch": 0.14133061951405673, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.03363037109375, "learning_rate": 4.7053836371343787e-07, "loss": 0.533, "mean_token_accuracy": 0.8348536491394043, "num_tokens": 42370496.0, "step": 1111 }, { "epoch": 0.14145782979264723, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 14.266997337341309, "learning_rate": 4.709622721492157e-07, "loss": 0.5127, "mean_token_accuracy": 0.8381249904632568, "num_tokens": 42406489.0, "step": 1112 }, { "epoch": 0.14158504007123776, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.031710624694824, "learning_rate": 4.713861805849936e-07, "loss": 0.5112, "mean_token_accuracy": 0.8373712301254272, "num_tokens": 42440679.0, "step": 1113 }, { "epoch": 0.14171225034982826, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.834766387939453e-06, "grad_norm": 14.1624116897583, "learning_rate": 4.718100890207715e-07, "loss": 0.5854, "mean_token_accuracy": 0.8165203332901001, "num_tokens": 42476699.0, "step": 1114 }, { "epoch": 0.1418394606284188, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.36794376373291, "learning_rate": 4.7223399745654936e-07, "loss": 0.5411, "mean_token_accuracy": 0.8310468792915344, "num_tokens": 42513348.0, "step": 1115 }, { "epoch": 0.1419666709070093, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.104776382446289, "learning_rate": 4.726579058923272e-07, "loss": 0.532, "mean_token_accuracy": 0.8349165916442871, "num_tokens": 42550633.0, "step": 1116 }, { "epoch": 0.1420938811855998, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.283455848693848, "learning_rate": 4.730818143281051e-07, "loss": 0.5559, "mean_token_accuracy": 0.8290227055549622, "num_tokens": 42593649.0, "step": 1117 }, { "epoch": 0.14222109146419032, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.203495025634766, "learning_rate": 4.7350572276388295e-07, "loss": 0.5039, "mean_token_accuracy": 0.8415383696556091, "num_tokens": 42632379.0, "step": 1118 }, { "epoch": 0.14234830174278082, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.090574264526367, "learning_rate": 4.7392963119966085e-07, "loss": 0.5299, "mean_token_accuracy": 0.8330061435699463, "num_tokens": 42670319.0, "step": 1119 }, { "epoch": 0.14247551202137132, "ewc_loss": 0.0142822265625, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 9.894371032714844e-06, "grad_norm": 14.274347305297852, "learning_rate": 4.7435353963543875e-07, "loss": 0.5229, "mean_token_accuracy": 0.8368383646011353, "num_tokens": 42712680.0, "step": 1120 }, { "epoch": 0.14260272229996185, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.372735023498535, "learning_rate": 4.747774480712166e-07, "loss": 0.5402, "mean_token_accuracy": 0.833003580570221, "num_tokens": 42748241.0, "step": 1121 }, { "epoch": 0.14272993257855235, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.239508628845215, "learning_rate": 4.7520135650699444e-07, "loss": 0.5612, "mean_token_accuracy": 0.82559734582901, "num_tokens": 42783208.0, "step": 1122 }, { "epoch": 0.14285714285714285, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.491668701171875, "learning_rate": 4.7562526494277234e-07, "loss": 0.5353, "mean_token_accuracy": 0.8336951732635498, "num_tokens": 42818570.0, "step": 1123 }, { "epoch": 0.14298435313573338, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.216540336608887, "learning_rate": 4.7604917337855024e-07, "loss": 0.4742, "mean_token_accuracy": 0.8481279611587524, "num_tokens": 42852891.0, "step": 1124 }, { "epoch": 0.14311156341432388, "ewc_loss": 0.0145263671875, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.305797576904297, "learning_rate": 4.764730818143281e-07, "loss": 0.525, "mean_token_accuracy": 0.8292113542556763, "num_tokens": 42887359.0, "step": 1125 }, { "epoch": 0.14323877369291438, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.38389778137207, "learning_rate": 4.768969902501059e-07, "loss": 0.578, "mean_token_accuracy": 0.8189996480941772, "num_tokens": 42929308.0, "step": 1126 }, { "epoch": 0.1433659839715049, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.064985275268555, "learning_rate": 4.773208986858838e-07, "loss": 0.5267, "mean_token_accuracy": 0.8328102827072144, "num_tokens": 42969373.0, "step": 1127 }, { "epoch": 0.1434931942500954, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.266865730285645, "learning_rate": 4.777448071216617e-07, "loss": 0.5307, "mean_token_accuracy": 0.834437906742096, "num_tokens": 43010133.0, "step": 1128 }, { "epoch": 0.1436204045286859, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.239044189453125, "learning_rate": 4.781687155574396e-07, "loss": 0.5584, "mean_token_accuracy": 0.8255102634429932, "num_tokens": 43053743.0, "step": 1129 }, { "epoch": 0.14374761480727644, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.288572311401367, "learning_rate": 4.785926239932175e-07, "loss": 0.4976, "mean_token_accuracy": 0.8425619006156921, "num_tokens": 43094125.0, "step": 1130 }, { "epoch": 0.14387482508586694, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.098669052124023, "learning_rate": 4.790165324289953e-07, "loss": 0.5247, "mean_token_accuracy": 0.8328493237495422, "num_tokens": 43132399.0, "step": 1131 }, { "epoch": 0.14400203536445744, "ewc_loss": 0.014404296875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 9.953975677490234e-06, "grad_norm": 14.314045906066895, "learning_rate": 4.794404408647732e-07, "loss": 0.5393, "mean_token_accuracy": 0.8323943018913269, "num_tokens": 43169036.0, "step": 1132 }, { "epoch": 0.14412924564304797, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.163896560668945, "learning_rate": 4.798643493005511e-07, "loss": 0.4674, "mean_token_accuracy": 0.8506137132644653, "num_tokens": 43209397.0, "step": 1133 }, { "epoch": 0.14425645592163847, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.281494140625, "learning_rate": 4.80288257736329e-07, "loss": 0.5669, "mean_token_accuracy": 0.8241052627563477, "num_tokens": 43246788.0, "step": 1134 }, { "epoch": 0.14438366620022897, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.16870403289795, "learning_rate": 4.807121661721068e-07, "loss": 0.4768, "mean_token_accuracy": 0.8482987880706787, "num_tokens": 43282774.0, "step": 1135 }, { "epoch": 0.1445108764788195, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.130599975585938, "learning_rate": 4.811360746078847e-07, "loss": 0.5355, "mean_token_accuracy": 0.8368265628814697, "num_tokens": 43322303.0, "step": 1136 }, { "epoch": 0.14463808675741, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.166619300842285, "learning_rate": 4.815599830436625e-07, "loss": 0.5273, "mean_token_accuracy": 0.835049569606781, "num_tokens": 43354611.0, "step": 1137 }, { "epoch": 0.1447652970360005, "ewc_loss": 0.0146484375, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0192394256591797e-05, "grad_norm": 14.391348838806152, "learning_rate": 4.819838914794405e-07, "loss": 0.5087, "mean_token_accuracy": 0.8392479419708252, "num_tokens": 43390640.0, "step": 1138 }, { "epoch": 0.14489250731459102, "ewc_loss": 0.0145263671875, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0073184967041016e-05, "grad_norm": 14.12742805480957, "learning_rate": 4.824077999152183e-07, "loss": 0.5302, "mean_token_accuracy": 0.8300734758377075, "num_tokens": 43424513.0, "step": 1139 }, { "epoch": 0.14501971759318152, "ewc_loss": 0.01446533203125, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0013580322265625e-05, "grad_norm": 14.31124496459961, "learning_rate": 4.828317083509962e-07, "loss": 0.5361, "mean_token_accuracy": 0.8313794732093811, "num_tokens": 43461699.0, "step": 1140 }, { "epoch": 0.14514692787177205, "ewc_loss": 0.0145263671875, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0073184967041016e-05, "grad_norm": 14.195483207702637, "learning_rate": 4.83255616786774e-07, "loss": 0.5094, "mean_token_accuracy": 0.8437018990516663, "num_tokens": 43494681.0, "step": 1141 }, { "epoch": 0.14527413815036255, "ewc_loss": 0.0146484375, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0132789611816406e-05, "grad_norm": 14.500996589660645, "learning_rate": 4.83679525222552e-07, "loss": 0.4848, "mean_token_accuracy": 0.8476076126098633, "num_tokens": 43531318.0, "step": 1142 }, { "epoch": 0.14540134842895305, "ewc_loss": 0.0147705078125, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0311603546142578e-05, "grad_norm": 14.179645538330078, "learning_rate": 4.841034336583298e-07, "loss": 0.5001, "mean_token_accuracy": 0.8468791842460632, "num_tokens": 43569991.0, "step": 1143 }, { "epoch": 0.14552855870754358, "ewc_loss": 0.0145263671875, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0073184967041016e-05, "grad_norm": 14.682931900024414, "learning_rate": 4.845273420941076e-07, "loss": 0.4878, "mean_token_accuracy": 0.8487036228179932, "num_tokens": 43609978.0, "step": 1144 }, { "epoch": 0.14565576898613408, "ewc_loss": 0.014892578125, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.0371208190917969e-05, "grad_norm": 14.334203720092773, "learning_rate": 4.849512505298855e-07, "loss": 0.5386, "mean_token_accuracy": 0.8272590637207031, "num_tokens": 43640419.0, "step": 1145 }, { "epoch": 0.14578297926472458, "ewc_loss": 0.01458740234375, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.0073184967041016e-05, "grad_norm": 14.400516510009766, "learning_rate": 4.853751589656634e-07, "loss": 0.498, "mean_token_accuracy": 0.8439509868621826, "num_tokens": 43682408.0, "step": 1146 }, { "epoch": 0.1459101895433151, "ewc_loss": 0.0147705078125, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0311603546142578e-05, "grad_norm": 14.477022171020508, "learning_rate": 4.857990674014413e-07, "loss": 0.5388, "mean_token_accuracy": 0.8294633626937866, "num_tokens": 43726158.0, "step": 1147 }, { "epoch": 0.1460373998219056, "ewc_loss": 0.01458740234375, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0132789611816406e-05, "grad_norm": 14.492533683776855, "learning_rate": 4.862229758372191e-07, "loss": 0.5696, "mean_token_accuracy": 0.8198502063751221, "num_tokens": 43766466.0, "step": 1148 }, { "epoch": 0.1461646101004961, "ewc_loss": 0.01458740234375, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.0132789611816406e-05, "grad_norm": 14.233721733093262, "learning_rate": 4.86646884272997e-07, "loss": 0.5535, "mean_token_accuracy": 0.824191153049469, "num_tokens": 43807255.0, "step": 1149 }, { "epoch": 0.14629182037908664, "ewc_loss": 0.0147705078125, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0251998901367188e-05, "grad_norm": 14.369954109191895, "learning_rate": 4.870707927087749e-07, "loss": 0.5334, "mean_token_accuracy": 0.831551194190979, "num_tokens": 43845419.0, "step": 1150 }, { "epoch": 0.14641903065767714, "ewc_loss": 0.014892578125, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.0371208190917969e-05, "grad_norm": 14.45308780670166, "learning_rate": 4.874947011445528e-07, "loss": 0.5422, "mean_token_accuracy": 0.8327443599700928, "num_tokens": 43882942.0, "step": 1151 }, { "epoch": 0.14654624093626764, "ewc_loss": 0.014892578125, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0371208190917969e-05, "grad_norm": 14.204557418823242, "learning_rate": 4.879186095803306e-07, "loss": 0.4598, "mean_token_accuracy": 0.8550150394439697, "num_tokens": 43923109.0, "step": 1152 }, { "epoch": 0.14667345121485817, "ewc_loss": 0.0147705078125, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0311603546142578e-05, "grad_norm": 14.580567359924316, "learning_rate": 4.883425180161085e-07, "loss": 0.4977, "mean_token_accuracy": 0.8417669534683228, "num_tokens": 43959752.0, "step": 1153 }, { "epoch": 0.14680066149344867, "ewc_loss": 0.0147705078125, "ewc_loss_diag": 4.500150680541992e-06, "ewc_loss_parallel": 1.0311603546142578e-05, "grad_norm": 14.306449890136719, "learning_rate": 4.887664264518864e-07, "loss": 0.5469, "mean_token_accuracy": 0.8337694406509399, "num_tokens": 43996247.0, "step": 1154 }, { "epoch": 0.14692787177203917, "ewc_loss": 0.0147705078125, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.0251998901367188e-05, "grad_norm": 14.487767219543457, "learning_rate": 4.891903348876643e-07, "loss": 0.5267, "mean_token_accuracy": 0.8380612730979919, "num_tokens": 44035751.0, "step": 1155 }, { "epoch": 0.1470550820506297, "ewc_loss": 0.014892578125, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.0371208190917969e-05, "grad_norm": 14.240650177001953, "learning_rate": 4.896142433234421e-07, "loss": 0.5625, "mean_token_accuracy": 0.8296316862106323, "num_tokens": 44073225.0, "step": 1156 }, { "epoch": 0.1471822923292202, "ewc_loss": 0.01483154296875, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.0311603546142578e-05, "grad_norm": 14.512064933776855, "learning_rate": 4.9003815175922e-07, "loss": 0.566, "mean_token_accuracy": 0.8290818929672241, "num_tokens": 44111761.0, "step": 1157 }, { "epoch": 0.1473095026078107, "ewc_loss": 0.01495361328125, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.043081283569336e-05, "grad_norm": 14.373546600341797, "learning_rate": 4.904620601949979e-07, "loss": 0.5167, "mean_token_accuracy": 0.8379993438720703, "num_tokens": 44151311.0, "step": 1158 }, { "epoch": 0.14743671288640123, "ewc_loss": 0.014892578125, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.0371208190917969e-05, "grad_norm": 14.446853637695312, "learning_rate": 4.908859686307758e-07, "loss": 0.5638, "mean_token_accuracy": 0.8266273736953735, "num_tokens": 44184623.0, "step": 1159 }, { "epoch": 0.14756392316499173, "ewc_loss": 0.014892578125, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.0371208190917969e-05, "grad_norm": 14.4514799118042, "learning_rate": 4.913098770665536e-07, "loss": 0.5476, "mean_token_accuracy": 0.8317809700965881, "num_tokens": 44221753.0, "step": 1160 }, { "epoch": 0.14769113344358223, "ewc_loss": 0.0150146484375, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.043081283569336e-05, "grad_norm": 14.409136772155762, "learning_rate": 4.917337855023314e-07, "loss": 0.501, "mean_token_accuracy": 0.838993489742279, "num_tokens": 44262542.0, "step": 1161 }, { "epoch": 0.14781834372217276, "ewc_loss": 0.0150146484375, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.049041748046875e-05, "grad_norm": 14.623173713684082, "learning_rate": 4.921576939381094e-07, "loss": 0.5246, "mean_token_accuracy": 0.8402289748191833, "num_tokens": 44300294.0, "step": 1162 }, { "epoch": 0.14794555400076326, "ewc_loss": 0.0150146484375, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.049041748046875e-05, "grad_norm": 14.524316787719727, "learning_rate": 4.925816023738872e-07, "loss": 0.5701, "mean_token_accuracy": 0.8216396570205688, "num_tokens": 44338476.0, "step": 1163 }, { "epoch": 0.14807276427935376, "ewc_loss": 0.01507568359375, "ewc_loss_diag": 4.589557647705078e-06, "ewc_loss_parallel": 1.049041748046875e-05, "grad_norm": 14.720763206481934, "learning_rate": 4.930055108096651e-07, "loss": 0.4735, "mean_token_accuracy": 0.8493294715881348, "num_tokens": 44367437.0, "step": 1164 }, { "epoch": 0.1481999745579443, "ewc_loss": 0.01513671875, "ewc_loss_diag": 4.6193599700927734e-06, "ewc_loss_parallel": 1.049041748046875e-05, "grad_norm": 14.366470336914062, "learning_rate": 4.934294192454429e-07, "loss": 0.5821, "mean_token_accuracy": 0.8193807601928711, "num_tokens": 44402636.0, "step": 1165 }, { "epoch": 0.1483271848365348, "ewc_loss": 0.01513671875, "ewc_loss_diag": 4.6193599700927734e-06, "ewc_loss_parallel": 1.055002212524414e-05, "grad_norm": 14.618782997131348, "learning_rate": 4.938533276812209e-07, "loss": 0.5243, "mean_token_accuracy": 0.837138295173645, "num_tokens": 44438187.0, "step": 1166 }, { "epoch": 0.14845439511512531, "ewc_loss": 0.0152587890625, "ewc_loss_diag": 4.649162292480469e-06, "ewc_loss_parallel": 1.0609626770019531e-05, "grad_norm": 14.478403091430664, "learning_rate": 4.942772361169987e-07, "loss": 0.4484, "mean_token_accuracy": 0.854153037071228, "num_tokens": 44478345.0, "step": 1167 }, { "epoch": 0.14858160539371582, "ewc_loss": 0.01513671875, "ewc_loss_diag": 4.678964614868164e-06, "ewc_loss_parallel": 1.049041748046875e-05, "grad_norm": 14.330970764160156, "learning_rate": 4.947011445527766e-07, "loss": 0.5291, "mean_token_accuracy": 0.8345724940299988, "num_tokens": 44520272.0, "step": 1168 }, { "epoch": 0.14870881567230632, "ewc_loss": 0.01513671875, "ewc_loss_diag": 4.678964614868164e-06, "ewc_loss_parallel": 1.049041748046875e-05, "grad_norm": 14.461043357849121, "learning_rate": 4.951250529885544e-07, "loss": 0.4778, "mean_token_accuracy": 0.8508554697036743, "num_tokens": 44561058.0, "step": 1169 }, { "epoch": 0.14883602595089684, "ewc_loss": 0.01531982421875, "ewc_loss_diag": 4.649162292480469e-06, "ewc_loss_parallel": 1.0669231414794922e-05, "grad_norm": 14.559883117675781, "learning_rate": 4.955489614243324e-07, "loss": 0.6135, "mean_token_accuracy": 0.8070751428604126, "num_tokens": 44599371.0, "step": 1170 }, { "epoch": 0.14896323622948734, "ewc_loss": 0.0152587890625, "ewc_loss_diag": 4.678964614868164e-06, "ewc_loss_parallel": 1.0609626770019531e-05, "grad_norm": 14.454099655151367, "learning_rate": 4.959728698601102e-07, "loss": 0.5226, "mean_token_accuracy": 0.8284562826156616, "num_tokens": 44640499.0, "step": 1171 }, { "epoch": 0.14909044650807785, "ewc_loss": 0.015380859375, "ewc_loss_diag": 4.678964614868164e-06, "ewc_loss_parallel": 1.0669231414794922e-05, "grad_norm": 14.531438827514648, "learning_rate": 4.963967782958881e-07, "loss": 0.5092, "mean_token_accuracy": 0.8405438661575317, "num_tokens": 44683164.0, "step": 1172 }, { "epoch": 0.14921765678666837, "ewc_loss": 0.01544189453125, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0728836059570312e-05, "grad_norm": 14.597230911254883, "learning_rate": 4.968206867316659e-07, "loss": 0.5251, "mean_token_accuracy": 0.8331338167190552, "num_tokens": 44721938.0, "step": 1173 }, { "epoch": 0.14934486706525887, "ewc_loss": 0.0152587890625, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.055002212524414e-05, "grad_norm": 14.416801452636719, "learning_rate": 4.972445951674439e-07, "loss": 0.4921, "mean_token_accuracy": 0.8418247699737549, "num_tokens": 44753506.0, "step": 1174 }, { "epoch": 0.14947207734384937, "ewc_loss": 0.01544189453125, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0728836059570312e-05, "grad_norm": 14.64697265625, "learning_rate": 4.976685036032216e-07, "loss": 0.5844, "mean_token_accuracy": 0.8171365261077881, "num_tokens": 44792170.0, "step": 1175 }, { "epoch": 0.1495992876224399, "ewc_loss": 0.01531982421875, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0609626770019531e-05, "grad_norm": 14.418170928955078, "learning_rate": 4.980924120389996e-07, "loss": 0.4587, "mean_token_accuracy": 0.8568191528320312, "num_tokens": 44830733.0, "step": 1176 }, { "epoch": 0.1497264979010304, "ewc_loss": 0.01544189453125, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0728836059570312e-05, "grad_norm": 14.634132385253906, "learning_rate": 4.985163204747774e-07, "loss": 0.4748, "mean_token_accuracy": 0.850172221660614, "num_tokens": 44869743.0, "step": 1177 }, { "epoch": 0.1498537081796209, "ewc_loss": 0.0155029296875, "ewc_loss_diag": 4.678964614868164e-06, "ewc_loss_parallel": 1.0848045349121094e-05, "grad_norm": 14.54803466796875, "learning_rate": 4.989402289105554e-07, "loss": 0.5212, "mean_token_accuracy": 0.8413918018341064, "num_tokens": 44913358.0, "step": 1178 }, { "epoch": 0.14998091845821143, "ewc_loss": 0.015380859375, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0669231414794922e-05, "grad_norm": 14.642070770263672, "learning_rate": 4.993641373463331e-07, "loss": 0.5811, "mean_token_accuracy": 0.8180536031723022, "num_tokens": 44949963.0, "step": 1179 }, { "epoch": 0.15010812873680193, "ewc_loss": 0.015625, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0907649993896484e-05, "grad_norm": 14.714838981628418, "learning_rate": 4.997880457821111e-07, "loss": 0.5368, "mean_token_accuracy": 0.8296976685523987, "num_tokens": 44988569.0, "step": 1180 }, { "epoch": 0.15023533901539243, "ewc_loss": 0.01544189453125, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0728836059570312e-05, "grad_norm": 14.382752418518066, "learning_rate": 5.002119542178889e-07, "loss": 0.512, "mean_token_accuracy": 0.8381154537200928, "num_tokens": 45032634.0, "step": 1181 }, { "epoch": 0.15036254929398296, "ewc_loss": 0.01556396484375, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0848045349121094e-05, "grad_norm": 14.673807144165039, "learning_rate": 5.006358626536667e-07, "loss": 0.48, "mean_token_accuracy": 0.8496896028518677, "num_tokens": 45069685.0, "step": 1182 }, { "epoch": 0.15048975957257346, "ewc_loss": 0.015625, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0967254638671875e-05, "grad_norm": 14.785754203796387, "learning_rate": 5.010597710894446e-07, "loss": 0.503, "mean_token_accuracy": 0.8407443165779114, "num_tokens": 45107481.0, "step": 1183 }, { "epoch": 0.15061696985116396, "ewc_loss": 0.015625, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0967254638671875e-05, "grad_norm": 14.69351577758789, "learning_rate": 5.014836795252225e-07, "loss": 0.4872, "mean_token_accuracy": 0.8461894392967224, "num_tokens": 45145747.0, "step": 1184 }, { "epoch": 0.1507441801297545, "ewc_loss": 0.015625, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.0967254638671875e-05, "grad_norm": 14.58951187133789, "learning_rate": 5.019075879610004e-07, "loss": 0.495, "mean_token_accuracy": 0.8468567728996277, "num_tokens": 45183140.0, "step": 1185 }, { "epoch": 0.150871390408345, "ewc_loss": 0.015625, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.0848045349121094e-05, "grad_norm": 14.733979225158691, "learning_rate": 5.023314963967783e-07, "loss": 0.5281, "mean_token_accuracy": 0.833043098449707, "num_tokens": 45224911.0, "step": 1186 }, { "epoch": 0.1509986006869355, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.1086463928222656e-05, "grad_norm": 14.70071029663086, "learning_rate": 5.027554048325562e-07, "loss": 0.5855, "mean_token_accuracy": 0.8185832500457764, "num_tokens": 45257388.0, "step": 1187 }, { "epoch": 0.15112581096552602, "ewc_loss": 0.0157470703125, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.1026859283447266e-05, "grad_norm": 14.651129722595215, "learning_rate": 5.03179313268334e-07, "loss": 0.5703, "mean_token_accuracy": 0.8216910362243652, "num_tokens": 45297876.0, "step": 1188 }, { "epoch": 0.15125302124411652, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.1146068572998047e-05, "grad_norm": 14.739189147949219, "learning_rate": 5.036032217041119e-07, "loss": 0.5441, "mean_token_accuracy": 0.8273770213127136, "num_tokens": 45334040.0, "step": 1189 }, { "epoch": 0.15138023152270705, "ewc_loss": 0.0157470703125, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.1026859283447266e-05, "grad_norm": 14.579178810119629, "learning_rate": 5.040271301398897e-07, "loss": 0.5293, "mean_token_accuracy": 0.8313897848129272, "num_tokens": 45375985.0, "step": 1190 }, { "epoch": 0.15150744180129755, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.1146068572998047e-05, "grad_norm": 14.965836524963379, "learning_rate": 5.044510385756676e-07, "loss": 0.4978, "mean_token_accuracy": 0.8403863310813904, "num_tokens": 45419951.0, "step": 1191 }, { "epoch": 0.15163465207988805, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.1146068572998047e-05, "grad_norm": 14.548623085021973, "learning_rate": 5.048749470114455e-07, "loss": 0.5115, "mean_token_accuracy": 0.8405579328536987, "num_tokens": 45456988.0, "step": 1192 }, { "epoch": 0.15176186235847858, "ewc_loss": 0.0157470703125, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.0967254638671875e-05, "grad_norm": 14.820503234863281, "learning_rate": 5.052988554472234e-07, "loss": 0.5289, "mean_token_accuracy": 0.8313192129135132, "num_tokens": 45488226.0, "step": 1193 }, { "epoch": 0.15188907263706908, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.1086463928222656e-05, "grad_norm": 14.711389541625977, "learning_rate": 5.057227638830013e-07, "loss": 0.5264, "mean_token_accuracy": 0.8360628485679626, "num_tokens": 45524661.0, "step": 1194 }, { "epoch": 0.15201628291565958, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.1146068572998047e-05, "grad_norm": 14.863753318786621, "learning_rate": 5.061466723187792e-07, "loss": 0.5284, "mean_token_accuracy": 0.8360157012939453, "num_tokens": 45563491.0, "step": 1195 }, { "epoch": 0.1521434931942501, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.1086463928222656e-05, "grad_norm": 14.847747802734375, "learning_rate": 5.065705807545569e-07, "loss": 0.4987, "mean_token_accuracy": 0.8442034125328064, "num_tokens": 45596771.0, "step": 1196 }, { "epoch": 0.1522707034728406, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.1086463928222656e-05, "grad_norm": 14.63113784790039, "learning_rate": 5.069944891903349e-07, "loss": 0.5175, "mean_token_accuracy": 0.8373385667800903, "num_tokens": 45640049.0, "step": 1197 }, { "epoch": 0.1523979137514311, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.798173904418945e-06, "ewc_loss_parallel": 1.1026859283447266e-05, "grad_norm": 14.941112518310547, "learning_rate": 5.074183976261127e-07, "loss": 0.5174, "mean_token_accuracy": 0.8365400433540344, "num_tokens": 45677375.0, "step": 1198 }, { "epoch": 0.15252512403002164, "ewc_loss": 0.0159912109375, "ewc_loss_diag": 4.798173904418945e-06, "ewc_loss_parallel": 1.1205673217773438e-05, "grad_norm": 14.559541702270508, "learning_rate": 5.078423060618906e-07, "loss": 0.5206, "mean_token_accuracy": 0.8379011750221252, "num_tokens": 45719253.0, "step": 1199 }, { "epoch": 0.15265233430861214, "ewc_loss": 0.015869140625, "ewc_loss_diag": 4.798173904418945e-06, "ewc_loss_parallel": 1.1026859283447266e-05, "grad_norm": 14.807512283325195, "learning_rate": 5.082662144976685e-07, "loss": 0.5182, "mean_token_accuracy": 0.833380937576294, "num_tokens": 45756065.0, "step": 1200 }, { "epoch": 0.15277954458720264, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.798173904418945e-06, "ewc_loss_parallel": 1.1324882507324219e-05, "grad_norm": 14.63829231262207, "learning_rate": 5.086901229334464e-07, "loss": 0.5056, "mean_token_accuracy": 0.8438808917999268, "num_tokens": 45791707.0, "step": 1201 }, { "epoch": 0.15290675486579317, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.798173904418945e-06, "ewc_loss_parallel": 1.1265277862548828e-05, "grad_norm": 14.886007308959961, "learning_rate": 5.091140313692243e-07, "loss": 0.5209, "mean_token_accuracy": 0.8354830741882324, "num_tokens": 45828464.0, "step": 1202 }, { "epoch": 0.15303396514438367, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1265277862548828e-05, "grad_norm": 14.64502239227295, "learning_rate": 5.095379398050022e-07, "loss": 0.5041, "mean_token_accuracy": 0.8403699398040771, "num_tokens": 45867046.0, "step": 1203 }, { "epoch": 0.15316117542297417, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1265277862548828e-05, "grad_norm": 14.795753479003906, "learning_rate": 5.099618482407799e-07, "loss": 0.5363, "mean_token_accuracy": 0.833095908164978, "num_tokens": 45900530.0, "step": 1204 }, { "epoch": 0.1532883857015647, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1324882507324219e-05, "grad_norm": 14.545273780822754, "learning_rate": 5.103857566765578e-07, "loss": 0.5149, "mean_token_accuracy": 0.840040922164917, "num_tokens": 45940002.0, "step": 1205 }, { "epoch": 0.1534155959801552, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1444091796875e-05, "grad_norm": 14.656956672668457, "learning_rate": 5.108096651123357e-07, "loss": 0.4748, "mean_token_accuracy": 0.8503320217132568, "num_tokens": 45975299.0, "step": 1206 }, { "epoch": 0.1535428062587457, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1265277862548828e-05, "grad_norm": 14.898573875427246, "learning_rate": 5.112335735481135e-07, "loss": 0.5136, "mean_token_accuracy": 0.8355239033699036, "num_tokens": 46012693.0, "step": 1207 }, { "epoch": 0.15367001653733622, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1563301086425781e-05, "grad_norm": 14.666926383972168, "learning_rate": 5.116574819838915e-07, "loss": 0.417, "mean_token_accuracy": 0.8688367605209351, "num_tokens": 46051558.0, "step": 1208 }, { "epoch": 0.15379722681592672, "ewc_loss": 0.01611328125, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1324882507324219e-05, "grad_norm": 14.745588302612305, "learning_rate": 5.120813904196693e-07, "loss": 0.5022, "mean_token_accuracy": 0.8422665596008301, "num_tokens": 46087173.0, "step": 1209 }, { "epoch": 0.15392443709451722, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1444091796875e-05, "grad_norm": 14.779664039611816, "learning_rate": 5.125052988554473e-07, "loss": 0.5581, "mean_token_accuracy": 0.8288800716400146, "num_tokens": 46120482.0, "step": 1210 }, { "epoch": 0.15405164737310775, "ewc_loss": 0.0162353515625, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.138448715209961e-05, "grad_norm": 14.77281665802002, "learning_rate": 5.12929207291225e-07, "loss": 0.5072, "mean_token_accuracy": 0.8387143611907959, "num_tokens": 46154478.0, "step": 1211 }, { "epoch": 0.15417885765169825, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1444091796875e-05, "grad_norm": 14.860568046569824, "learning_rate": 5.133531157270029e-07, "loss": 0.4825, "mean_token_accuracy": 0.8478880524635315, "num_tokens": 46195012.0, "step": 1212 }, { "epoch": 0.15430606793028875, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.1444091796875e-05, "grad_norm": 14.768014907836914, "learning_rate": 5.137770241627808e-07, "loss": 0.4462, "mean_token_accuracy": 0.8579670190811157, "num_tokens": 46230641.0, "step": 1213 }, { "epoch": 0.15443327820887928, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.150369644165039e-05, "grad_norm": 14.835715293884277, "learning_rate": 5.142009325985587e-07, "loss": 0.4881, "mean_token_accuracy": 0.8482823371887207, "num_tokens": 46268993.0, "step": 1214 }, { "epoch": 0.15456048848746978, "ewc_loss": 0.0162353515625, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.138448715209961e-05, "grad_norm": 14.785704612731934, "learning_rate": 5.146248410343365e-07, "loss": 0.5218, "mean_token_accuracy": 0.8376026153564453, "num_tokens": 46309310.0, "step": 1215 }, { "epoch": 0.1546876987660603, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.150369644165039e-05, "grad_norm": 15.006339073181152, "learning_rate": 5.150487494701145e-07, "loss": 0.4624, "mean_token_accuracy": 0.8547882437705994, "num_tokens": 46352415.0, "step": 1216 }, { "epoch": 0.1548149090446508, "ewc_loss": 0.0162353515625, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.138448715209961e-05, "grad_norm": 14.843199729919434, "learning_rate": 5.154726579058923e-07, "loss": 0.5184, "mean_token_accuracy": 0.8362956047058105, "num_tokens": 46387838.0, "step": 1217 }, { "epoch": 0.1549421193232413, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.150369644165039e-05, "grad_norm": 14.991092681884766, "learning_rate": 5.158965663416703e-07, "loss": 0.5137, "mean_token_accuracy": 0.839974045753479, "num_tokens": 46432669.0, "step": 1218 }, { "epoch": 0.15506932960183184, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.1444091796875e-05, "grad_norm": 14.956192970275879, "learning_rate": 5.16320474777448e-07, "loss": 0.468, "mean_token_accuracy": 0.8521006107330322, "num_tokens": 46474383.0, "step": 1219 }, { "epoch": 0.15519653988042234, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.1563301086425781e-05, "grad_norm": 14.742088317871094, "learning_rate": 5.167443832132259e-07, "loss": 0.5197, "mean_token_accuracy": 0.8366752862930298, "num_tokens": 46514263.0, "step": 1220 }, { "epoch": 0.15532375015901284, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.1444091796875e-05, "grad_norm": 14.917532920837402, "learning_rate": 5.171682916490038e-07, "loss": 0.4855, "mean_token_accuracy": 0.8468165397644043, "num_tokens": 46549835.0, "step": 1221 }, { "epoch": 0.15545096043760337, "ewc_loss": 0.0164794921875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.1622905731201172e-05, "grad_norm": 14.661054611206055, "learning_rate": 5.175922000847816e-07, "loss": 0.4737, "mean_token_accuracy": 0.8517091274261475, "num_tokens": 46584811.0, "step": 1222 }, { "epoch": 0.15557817071619387, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.1563301086425781e-05, "grad_norm": 14.814199447631836, "learning_rate": 5.180161085205595e-07, "loss": 0.4949, "mean_token_accuracy": 0.8444592952728271, "num_tokens": 46621441.0, "step": 1223 }, { "epoch": 0.15570538099478437, "ewc_loss": 0.0164794921875, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 1.1682510375976562e-05, "grad_norm": 14.766343116760254, "learning_rate": 5.184400169563374e-07, "loss": 0.5494, "mean_token_accuracy": 0.8284765481948853, "num_tokens": 46661572.0, "step": 1224 }, { "epoch": 0.1558325912733749, "ewc_loss": 0.016357421875, "ewc_loss_diag": 4.887580871582031e-06, "ewc_loss_parallel": 1.1563301086425781e-05, "grad_norm": 14.878596305847168, "learning_rate": 5.188639253921153e-07, "loss": 0.524, "mean_token_accuracy": 0.8349918723106384, "num_tokens": 46702157.0, "step": 1225 }, { "epoch": 0.1559598015519654, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.887580871582031e-06, "ewc_loss_parallel": 1.1682510375976562e-05, "grad_norm": 14.82972240447998, "learning_rate": 5.192878338278932e-07, "loss": 0.5013, "mean_token_accuracy": 0.8431835174560547, "num_tokens": 46739119.0, "step": 1226 }, { "epoch": 0.1560870118305559, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.887580871582031e-06, "ewc_loss_parallel": 1.1682510375976562e-05, "grad_norm": 14.75313949584961, "learning_rate": 5.19711742263671e-07, "loss": 0.559, "mean_token_accuracy": 0.8252395391464233, "num_tokens": 46783990.0, "step": 1227 }, { "epoch": 0.15621422210914643, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.887580871582031e-06, "ewc_loss_parallel": 1.1801719665527344e-05, "grad_norm": 14.854636192321777, "learning_rate": 5.201356506994488e-07, "loss": 0.4949, "mean_token_accuracy": 0.8396945595741272, "num_tokens": 46816131.0, "step": 1228 }, { "epoch": 0.15634143238773693, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.887580871582031e-06, "ewc_loss_parallel": 1.1742115020751953e-05, "grad_norm": 14.797308921813965, "learning_rate": 5.205595591352268e-07, "loss": 0.5815, "mean_token_accuracy": 0.8203587532043457, "num_tokens": 46856453.0, "step": 1229 }, { "epoch": 0.15646864266632743, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.887580871582031e-06, "ewc_loss_parallel": 1.1742115020751953e-05, "grad_norm": 14.907387733459473, "learning_rate": 5.209834675710046e-07, "loss": 0.5191, "mean_token_accuracy": 0.8367325067520142, "num_tokens": 46897961.0, "step": 1230 }, { "epoch": 0.15659585294491796, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.9173831939697266e-06, "ewc_loss_parallel": 1.1682510375976562e-05, "grad_norm": 14.834285736083984, "learning_rate": 5.214073760067825e-07, "loss": 0.5947, "mean_token_accuracy": 0.8159980177879333, "num_tokens": 46931961.0, "step": 1231 }, { "epoch": 0.15672306322350846, "ewc_loss": 0.0167236328125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1801719665527344e-05, "grad_norm": 14.91752815246582, "learning_rate": 5.218312844425604e-07, "loss": 0.515, "mean_token_accuracy": 0.8409507274627686, "num_tokens": 46967822.0, "step": 1232 }, { "epoch": 0.15685027350209896, "ewc_loss": 0.0166015625, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1742115020751953e-05, "grad_norm": 14.830972671508789, "learning_rate": 5.222551928783383e-07, "loss": 0.4944, "mean_token_accuracy": 0.844752311706543, "num_tokens": 47004511.0, "step": 1233 }, { "epoch": 0.1569774837806895, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 14.83701229095459, "learning_rate": 5.226791013141161e-07, "loss": 0.5476, "mean_token_accuracy": 0.8318904638290405, "num_tokens": 47047244.0, "step": 1234 }, { "epoch": 0.15710469405928, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 14.93821907043457, "learning_rate": 5.23103009749894e-07, "loss": 0.506, "mean_token_accuracy": 0.8414307236671448, "num_tokens": 47088206.0, "step": 1235 }, { "epoch": 0.1572319043378705, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 14.861795425415039, "learning_rate": 5.235269181856718e-07, "loss": 0.5146, "mean_token_accuracy": 0.8372195363044739, "num_tokens": 47132733.0, "step": 1236 }, { "epoch": 0.15735911461646102, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1861324310302734e-05, "grad_norm": 14.970032691955566, "learning_rate": 5.239508266214498e-07, "loss": 0.573, "mean_token_accuracy": 0.81810462474823, "num_tokens": 47166840.0, "step": 1237 }, { "epoch": 0.15748632489505152, "ewc_loss": 0.0167236328125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1801719665527344e-05, "grad_norm": 14.807957649230957, "learning_rate": 5.243747350572276e-07, "loss": 0.5319, "mean_token_accuracy": 0.835039496421814, "num_tokens": 47203444.0, "step": 1238 }, { "epoch": 0.15761353517364202, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 14.958066940307617, "learning_rate": 5.247986434930056e-07, "loss": 0.5257, "mean_token_accuracy": 0.8384204506874084, "num_tokens": 47240380.0, "step": 1239 }, { "epoch": 0.15774074545223254, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 14.845577239990234, "learning_rate": 5.252225519287834e-07, "loss": 0.5868, "mean_token_accuracy": 0.8170253038406372, "num_tokens": 47277886.0, "step": 1240 }, { "epoch": 0.15786795573082305, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 15.257224082946777, "learning_rate": 5.256464603645613e-07, "loss": 0.5141, "mean_token_accuracy": 0.842921257019043, "num_tokens": 47315261.0, "step": 1241 }, { "epoch": 0.15799516600941357, "ewc_loss": 0.0169677734375, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.2040138244628906e-05, "grad_norm": 14.975544929504395, "learning_rate": 5.260703688003391e-07, "loss": 0.5122, "mean_token_accuracy": 0.8392321467399597, "num_tokens": 47357489.0, "step": 1242 }, { "epoch": 0.15812237628800407, "ewc_loss": 0.016845703125, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 15.039578437805176, "learning_rate": 5.26494277236117e-07, "loss": 0.531, "mean_token_accuracy": 0.8348017334938049, "num_tokens": 47389851.0, "step": 1243 }, { "epoch": 0.15824958656659457, "ewc_loss": 0.0169677734375, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.1980533599853516e-05, "grad_norm": 15.23077392578125, "learning_rate": 5.269181856718948e-07, "loss": 0.5562, "mean_token_accuracy": 0.8249115943908691, "num_tokens": 47427491.0, "step": 1244 }, { "epoch": 0.1583767968451851, "ewc_loss": 0.01708984375, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.2040138244628906e-05, "grad_norm": 15.02670669555664, "learning_rate": 5.273420941076727e-07, "loss": 0.4954, "mean_token_accuracy": 0.8455678224563599, "num_tokens": 47462790.0, "step": 1245 }, { "epoch": 0.1585040071237756, "ewc_loss": 0.016845703125, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.1920928955078125e-05, "grad_norm": 14.904025077819824, "learning_rate": 5.277660025434506e-07, "loss": 0.4577, "mean_token_accuracy": 0.8550437688827515, "num_tokens": 47502240.0, "step": 1246 }, { "epoch": 0.1586312174023661, "ewc_loss": 0.016845703125, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.1861324310302734e-05, "grad_norm": 15.057760238647461, "learning_rate": 5.281899109792285e-07, "loss": 0.4831, "mean_token_accuracy": 0.8475693464279175, "num_tokens": 47542493.0, "step": 1247 }, { "epoch": 0.15875842768095663, "ewc_loss": 0.01708984375, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.2040138244628906e-05, "grad_norm": 15.116632461547852, "learning_rate": 5.286138194150064e-07, "loss": 0.5206, "mean_token_accuracy": 0.8396997451782227, "num_tokens": 47583378.0, "step": 1248 }, { "epoch": 0.15888563795954713, "ewc_loss": 0.0169677734375, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.1980533599853516e-05, "grad_norm": 15.197105407714844, "learning_rate": 5.290377278507841e-07, "loss": 0.5445, "mean_token_accuracy": 0.8347632884979248, "num_tokens": 47622671.0, "step": 1249 }, { "epoch": 0.15901284823813763, "ewc_loss": 0.01708984375, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.2099742889404297e-05, "grad_norm": 15.066831588745117, "learning_rate": 5.294616362865621e-07, "loss": 0.5049, "mean_token_accuracy": 0.8452638387680054, "num_tokens": 47663774.0, "step": 1250 }, { "epoch": 0.15914005851672816, "ewc_loss": 0.0169677734375, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 1.1980533599853516e-05, "grad_norm": 15.108490943908691, "learning_rate": 5.298855447223399e-07, "loss": 0.5759, "mean_token_accuracy": 0.8296111226081848, "num_tokens": 47706143.0, "step": 1251 }, { "epoch": 0.15926726879531866, "ewc_loss": 0.01708984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2099742889404297e-05, "grad_norm": 15.120285034179688, "learning_rate": 5.303094531581178e-07, "loss": 0.5387, "mean_token_accuracy": 0.8329029083251953, "num_tokens": 47741197.0, "step": 1252 }, { "epoch": 0.15939447907390916, "ewc_loss": 0.0169677734375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.1980533599853516e-05, "grad_norm": 15.137640953063965, "learning_rate": 5.307333615938957e-07, "loss": 0.4603, "mean_token_accuracy": 0.8546515107154846, "num_tokens": 47779306.0, "step": 1253 }, { "epoch": 0.1595216893524997, "ewc_loss": 0.01708984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2099742889404297e-05, "grad_norm": 15.28049087524414, "learning_rate": 5.311572700296736e-07, "loss": 0.5459, "mean_token_accuracy": 0.8307528495788574, "num_tokens": 47811056.0, "step": 1254 }, { "epoch": 0.1596488996310902, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.065610885620117, "learning_rate": 5.315811784654515e-07, "loss": 0.4536, "mean_token_accuracy": 0.85326087474823, "num_tokens": 47847075.0, "step": 1255 }, { "epoch": 0.1597761099096807, "ewc_loss": 0.01708984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2099742889404297e-05, "grad_norm": 15.207586288452148, "learning_rate": 5.320050869012294e-07, "loss": 0.5107, "mean_token_accuracy": 0.8418941497802734, "num_tokens": 47892639.0, "step": 1256 }, { "epoch": 0.15990332018827122, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2159347534179688e-05, "grad_norm": 15.14215087890625, "learning_rate": 5.324289953370071e-07, "loss": 0.5692, "mean_token_accuracy": 0.8235561847686768, "num_tokens": 47933522.0, "step": 1257 }, { "epoch": 0.16003053046686172, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2159347534179688e-05, "grad_norm": 15.293041229248047, "learning_rate": 5.328529037727851e-07, "loss": 0.5056, "mean_token_accuracy": 0.840160608291626, "num_tokens": 47977307.0, "step": 1258 }, { "epoch": 0.16015774074545222, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2159347534179688e-05, "grad_norm": 15.42672348022461, "learning_rate": 5.332768122085629e-07, "loss": 0.5024, "mean_token_accuracy": 0.8415102958679199, "num_tokens": 48015129.0, "step": 1259 }, { "epoch": 0.16028495102404275, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2159347534179688e-05, "grad_norm": 15.203083038330078, "learning_rate": 5.337007206443408e-07, "loss": 0.5383, "mean_token_accuracy": 0.8289156556129456, "num_tokens": 48051904.0, "step": 1260 }, { "epoch": 0.16041216130263325, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 14.988325119018555, "learning_rate": 5.341246290801187e-07, "loss": 0.499, "mean_token_accuracy": 0.8460836410522461, "num_tokens": 48094001.0, "step": 1261 }, { "epoch": 0.16053937158122375, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.179352760314941, "learning_rate": 5.345485375158966e-07, "loss": 0.4907, "mean_token_accuracy": 0.843986451625824, "num_tokens": 48134124.0, "step": 1262 }, { "epoch": 0.16066658185981428, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.36871337890625, "learning_rate": 5.349724459516745e-07, "loss": 0.481, "mean_token_accuracy": 0.8468632698059082, "num_tokens": 48170730.0, "step": 1263 }, { "epoch": 0.16079379213840478, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.114789962768555, "learning_rate": 5.353963543874522e-07, "loss": 0.5526, "mean_token_accuracy": 0.8287044763565063, "num_tokens": 48212640.0, "step": 1264 }, { "epoch": 0.1609210024169953, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.238353729248047, "learning_rate": 5.358202628232301e-07, "loss": 0.5322, "mean_token_accuracy": 0.8344646096229553, "num_tokens": 48243136.0, "step": 1265 }, { "epoch": 0.1610482126955858, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.339722633361816, "learning_rate": 5.36244171259008e-07, "loss": 0.5089, "mean_token_accuracy": 0.8406801819801331, "num_tokens": 48282905.0, "step": 1266 }, { "epoch": 0.1611754229741763, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.327077865600586, "learning_rate": 5.366680796947859e-07, "loss": 0.547, "mean_token_accuracy": 0.8306783437728882, "num_tokens": 48323325.0, "step": 1267 }, { "epoch": 0.16130263325276684, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.223531723022461, "learning_rate": 5.370919881305637e-07, "loss": 0.5249, "mean_token_accuracy": 0.8387386798858643, "num_tokens": 48358400.0, "step": 1268 }, { "epoch": 0.16142984353135734, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.323226928710938, "learning_rate": 5.375158965663417e-07, "loss": 0.5267, "mean_token_accuracy": 0.8373147249221802, "num_tokens": 48393018.0, "step": 1269 }, { "epoch": 0.16155705380994784, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.301865577697754, "learning_rate": 5.379398050021195e-07, "loss": 0.513, "mean_token_accuracy": 0.840058445930481, "num_tokens": 48432952.0, "step": 1270 }, { "epoch": 0.16168426408853837, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.575940132141113, "learning_rate": 5.383637134378975e-07, "loss": 0.5034, "mean_token_accuracy": 0.839766263961792, "num_tokens": 48466913.0, "step": 1271 }, { "epoch": 0.16181147436712887, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.28980827331543, "learning_rate": 5.387876218736752e-07, "loss": 0.5828, "mean_token_accuracy": 0.8161180019378662, "num_tokens": 48512658.0, "step": 1272 }, { "epoch": 0.16193868464571937, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.492491722106934, "learning_rate": 5.392115303094531e-07, "loss": 0.4792, "mean_token_accuracy": 0.8478972315788269, "num_tokens": 48552283.0, "step": 1273 }, { "epoch": 0.1620658949243099, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.320422172546387, "learning_rate": 5.39635438745231e-07, "loss": 0.4599, "mean_token_accuracy": 0.8557820320129395, "num_tokens": 48590504.0, "step": 1274 }, { "epoch": 0.1621931052029004, "ewc_loss": 0.0172119140625, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.355289459228516, "learning_rate": 5.400593471810089e-07, "loss": 0.5477, "mean_token_accuracy": 0.8299202919006348, "num_tokens": 48629993.0, "step": 1275 }, { "epoch": 0.1623203154814909, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.469154357910156, "learning_rate": 5.404832556167867e-07, "loss": 0.552, "mean_token_accuracy": 0.828399658203125, "num_tokens": 48665074.0, "step": 1276 }, { "epoch": 0.16244752576008142, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.448286056518555, "learning_rate": 5.409071640525647e-07, "loss": 0.5086, "mean_token_accuracy": 0.8438620567321777, "num_tokens": 48705524.0, "step": 1277 }, { "epoch": 0.16257473603867192, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.438539505004883, "learning_rate": 5.413310724883425e-07, "loss": 0.5558, "mean_token_accuracy": 0.8179202079772949, "num_tokens": 48744030.0, "step": 1278 }, { "epoch": 0.16270194631726242, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.233816146850586e-05, "grad_norm": 15.377617835998535, "learning_rate": 5.417549809241205e-07, "loss": 0.5476, "mean_token_accuracy": 0.8327288031578064, "num_tokens": 48788401.0, "step": 1279 }, { "epoch": 0.16282915659585295, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.2218952178955078e-05, "grad_norm": 15.316184997558594, "learning_rate": 5.421788893598982e-07, "loss": 0.5501, "mean_token_accuracy": 0.8289448022842407, "num_tokens": 48823046.0, "step": 1280 }, { "epoch": 0.16295636687444345, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.36988639831543, "learning_rate": 5.42602797795676e-07, "loss": 0.556, "mean_token_accuracy": 0.8252521753311157, "num_tokens": 48868293.0, "step": 1281 }, { "epoch": 0.16308357715303395, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.2278556823730469e-05, "grad_norm": 15.483948707580566, "learning_rate": 5.43026706231454e-07, "loss": 0.5837, "mean_token_accuracy": 0.8184256553649902, "num_tokens": 48908775.0, "step": 1282 }, { "epoch": 0.16321078743162448, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.436182022094727, "learning_rate": 5.434506146672319e-07, "loss": 0.5623, "mean_token_accuracy": 0.8246241211891174, "num_tokens": 48957640.0, "step": 1283 }, { "epoch": 0.16333799771021498, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.233816146850586e-05, "grad_norm": 15.64527702331543, "learning_rate": 5.438745231030097e-07, "loss": 0.5523, "mean_token_accuracy": 0.8250917196273804, "num_tokens": 48993601.0, "step": 1284 }, { "epoch": 0.16346520798880548, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.332365989685059, "learning_rate": 5.442984315387876e-07, "loss": 0.4604, "mean_token_accuracy": 0.8555907011032104, "num_tokens": 49028411.0, "step": 1285 }, { "epoch": 0.163592418267396, "ewc_loss": 0.017333984375, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 1.233816146850586e-05, "grad_norm": 15.623178482055664, "learning_rate": 5.447223399745655e-07, "loss": 0.5224, "mean_token_accuracy": 0.8388717174530029, "num_tokens": 49067449.0, "step": 1286 }, { "epoch": 0.1637196285459865, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.548477172851562, "learning_rate": 5.451462484103433e-07, "loss": 0.4977, "mean_token_accuracy": 0.8445557951927185, "num_tokens": 49102650.0, "step": 1287 }, { "epoch": 0.163846838824577, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.245737075805664e-05, "grad_norm": 15.426619529724121, "learning_rate": 5.455701568461212e-07, "loss": 0.5231, "mean_token_accuracy": 0.8346006870269775, "num_tokens": 49137110.0, "step": 1288 }, { "epoch": 0.16397404910316754, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.464337348937988, "learning_rate": 5.45994065281899e-07, "loss": 0.5945, "mean_token_accuracy": 0.813269853591919, "num_tokens": 49177765.0, "step": 1289 }, { "epoch": 0.16410125938175804, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.233816146850586e-05, "grad_norm": 15.728364944458008, "learning_rate": 5.46417973717677e-07, "loss": 0.5343, "mean_token_accuracy": 0.8289595246315002, "num_tokens": 49207860.0, "step": 1290 }, { "epoch": 0.16422846966034857, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.44515323638916, "learning_rate": 5.468418821534548e-07, "loss": 0.4655, "mean_token_accuracy": 0.8527954816818237, "num_tokens": 49239994.0, "step": 1291 }, { "epoch": 0.16435567993893907, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.245737075805664e-05, "grad_norm": 15.844776153564453, "learning_rate": 5.472657905892327e-07, "loss": 0.4803, "mean_token_accuracy": 0.851393461227417, "num_tokens": 49273464.0, "step": 1292 }, { "epoch": 0.16448289021752957, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.577302932739258, "learning_rate": 5.476896990250106e-07, "loss": 0.5965, "mean_token_accuracy": 0.8156366944313049, "num_tokens": 49313144.0, "step": 1293 }, { "epoch": 0.1646101004961201, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.551797866821289, "learning_rate": 5.481136074607885e-07, "loss": 0.4815, "mean_token_accuracy": 0.8441622257232666, "num_tokens": 49351392.0, "step": 1294 }, { "epoch": 0.1647373107747106, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.245737075805664e-05, "grad_norm": 15.701549530029297, "learning_rate": 5.485375158965663e-07, "loss": 0.4826, "mean_token_accuracy": 0.8489761352539062, "num_tokens": 49396726.0, "step": 1295 }, { "epoch": 0.1648645210533011, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.496255874633789, "learning_rate": 5.489614243323442e-07, "loss": 0.5166, "mean_token_accuracy": 0.8371303677558899, "num_tokens": 49435856.0, "step": 1296 }, { "epoch": 0.16499173133189163, "ewc_loss": 0.0174560546875, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.239776611328125e-05, "grad_norm": 15.701273918151855, "learning_rate": 5.49385332768122e-07, "loss": 0.4569, "mean_token_accuracy": 0.8540070056915283, "num_tokens": 49473695.0, "step": 1297 }, { "epoch": 0.16511894161048213, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2516975402832031e-05, "grad_norm": 15.550207138061523, "learning_rate": 5.498092412039e-07, "loss": 0.5853, "mean_token_accuracy": 0.821652889251709, "num_tokens": 49509165.0, "step": 1298 }, { "epoch": 0.16524615188907263, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2516975402832031e-05, "grad_norm": 15.693472862243652, "learning_rate": 5.502331496396778e-07, "loss": 0.489, "mean_token_accuracy": 0.844904899597168, "num_tokens": 49553790.0, "step": 1299 }, { "epoch": 0.16537336216766316, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.245737075805664e-05, "grad_norm": 15.65869426727295, "learning_rate": 5.506570580754557e-07, "loss": 0.5119, "mean_token_accuracy": 0.8401262164115906, "num_tokens": 49593124.0, "step": 1300 }, { "epoch": 0.16550057244625366, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.245737075805664e-05, "grad_norm": 15.452461242675781, "learning_rate": 5.510809665112336e-07, "loss": 0.5009, "mean_token_accuracy": 0.8454029560089111, "num_tokens": 49632016.0, "step": 1301 }, { "epoch": 0.16562778272484416, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.245737075805664e-05, "grad_norm": 15.51745319366455, "learning_rate": 5.515048749470113e-07, "loss": 0.5136, "mean_token_accuracy": 0.83943772315979, "num_tokens": 49670689.0, "step": 1302 }, { "epoch": 0.1657549930034347, "ewc_loss": 0.0177001953125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2576580047607422e-05, "grad_norm": 15.606889724731445, "learning_rate": 5.519287833827893e-07, "loss": 0.5001, "mean_token_accuracy": 0.84242182970047, "num_tokens": 49709230.0, "step": 1303 }, { "epoch": 0.1658822032820252, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2755393981933594e-05, "grad_norm": 15.509525299072266, "learning_rate": 5.523526918185671e-07, "loss": 0.5581, "mean_token_accuracy": 0.8238904476165771, "num_tokens": 49741648.0, "step": 1304 }, { "epoch": 0.1660094135606157, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2755393981933594e-05, "grad_norm": 15.66331958770752, "learning_rate": 5.52776600254345e-07, "loss": 0.4755, "mean_token_accuracy": 0.8466579914093018, "num_tokens": 49773906.0, "step": 1305 }, { "epoch": 0.16613662383920622, "ewc_loss": 0.017578125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2516975402832031e-05, "grad_norm": 15.45759391784668, "learning_rate": 5.532005086901229e-07, "loss": 0.5326, "mean_token_accuracy": 0.8370829820632935, "num_tokens": 49812645.0, "step": 1306 }, { "epoch": 0.16626383411779672, "ewc_loss": 0.0177001953125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2636184692382812e-05, "grad_norm": 15.609639167785645, "learning_rate": 5.536244171259008e-07, "loss": 0.5022, "mean_token_accuracy": 0.8432493805885315, "num_tokens": 49854500.0, "step": 1307 }, { "epoch": 0.16639104439638722, "ewc_loss": 0.0177001953125, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2576580047607422e-05, "grad_norm": 15.59636402130127, "learning_rate": 5.540483255616786e-07, "loss": 0.5965, "mean_token_accuracy": 0.8161735534667969, "num_tokens": 49889408.0, "step": 1308 }, { "epoch": 0.16651825467497774, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2755393981933594e-05, "grad_norm": 15.726046562194824, "learning_rate": 5.544722339974566e-07, "loss": 0.521, "mean_token_accuracy": 0.8385438919067383, "num_tokens": 49923581.0, "step": 1309 }, { "epoch": 0.16664546495356825, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2695789337158203e-05, "grad_norm": 15.508020401000977, "learning_rate": 5.548961424332343e-07, "loss": 0.5096, "mean_token_accuracy": 0.8367517590522766, "num_tokens": 49959433.0, "step": 1310 }, { "epoch": 0.16677267523215875, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2695789337158203e-05, "grad_norm": 15.552116394042969, "learning_rate": 5.553200508690123e-07, "loss": 0.4767, "mean_token_accuracy": 0.8515021800994873, "num_tokens": 50000307.0, "step": 1311 }, { "epoch": 0.16689988551074927, "ewc_loss": 0.0179443359375, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 1.2814998626708984e-05, "grad_norm": 15.822860717773438, "learning_rate": 5.557439593047901e-07, "loss": 0.5715, "mean_token_accuracy": 0.8199528455734253, "num_tokens": 50035587.0, "step": 1312 }, { "epoch": 0.16702709578933977, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.125999450683594e-06, "ewc_loss_parallel": 1.2636184692382812e-05, "grad_norm": 15.638909339904785, "learning_rate": 5.56167867740568e-07, "loss": 0.5078, "mean_token_accuracy": 0.8416368961334229, "num_tokens": 50079425.0, "step": 1313 }, { "epoch": 0.16715430606793028, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.125999450683594e-06, "ewc_loss_parallel": 1.2695789337158203e-05, "grad_norm": 15.36053466796875, "learning_rate": 5.565917761763459e-07, "loss": 0.4895, "mean_token_accuracy": 0.8472788333892822, "num_tokens": 50110674.0, "step": 1314 }, { "epoch": 0.1672815163465208, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.125999450683594e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.617674827575684, "learning_rate": 5.570156846121238e-07, "loss": 0.5304, "mean_token_accuracy": 0.832497239112854, "num_tokens": 50149033.0, "step": 1315 }, { "epoch": 0.1674087266251113, "ewc_loss": 0.017822265625, "ewc_loss_diag": 5.125999450683594e-06, "ewc_loss_parallel": 1.2755393981933594e-05, "grad_norm": 15.656845092773438, "learning_rate": 5.574395930479016e-07, "loss": 0.5489, "mean_token_accuracy": 0.8306581377983093, "num_tokens": 50185728.0, "step": 1316 }, { "epoch": 0.16753593690370183, "ewc_loss": 0.0179443359375, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2814998626708984e-05, "grad_norm": 15.628299713134766, "learning_rate": 5.578635014836796e-07, "loss": 0.4993, "mean_token_accuracy": 0.8441432118415833, "num_tokens": 50219671.0, "step": 1317 }, { "epoch": 0.16766314718229233, "ewc_loss": 0.0179443359375, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2755393981933594e-05, "grad_norm": 15.750894546508789, "learning_rate": 5.582874099194573e-07, "loss": 0.5622, "mean_token_accuracy": 0.8284924626350403, "num_tokens": 50251989.0, "step": 1318 }, { "epoch": 0.16779035746088283, "ewc_loss": 0.0179443359375, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2814998626708984e-05, "grad_norm": 15.607121467590332, "learning_rate": 5.587113183552353e-07, "loss": 0.5452, "mean_token_accuracy": 0.8313395977020264, "num_tokens": 50291079.0, "step": 1319 }, { "epoch": 0.16791756773947336, "ewc_loss": 0.0179443359375, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2814998626708984e-05, "grad_norm": 15.669173240661621, "learning_rate": 5.591352267910131e-07, "loss": 0.4992, "mean_token_accuracy": 0.8441469669342041, "num_tokens": 50332262.0, "step": 1320 }, { "epoch": 0.16804477801806386, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2934207916259766e-05, "grad_norm": 15.72243881225586, "learning_rate": 5.59559135226791e-07, "loss": 0.5295, "mean_token_accuracy": 0.8373430967330933, "num_tokens": 50371278.0, "step": 1321 }, { "epoch": 0.16817198829665436, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.887245178222656, "learning_rate": 5.599830436625689e-07, "loss": 0.5083, "mean_token_accuracy": 0.8406343460083008, "num_tokens": 50409463.0, "step": 1322 }, { "epoch": 0.1682991985752449, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.77790355682373, "learning_rate": 5.604069520983468e-07, "loss": 0.5012, "mean_token_accuracy": 0.8434407711029053, "num_tokens": 50446745.0, "step": 1323 }, { "epoch": 0.1684264088538354, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 1.2814998626708984e-05, "grad_norm": 15.835307121276855, "learning_rate": 5.608308605341246e-07, "loss": 0.4928, "mean_token_accuracy": 0.8440618515014648, "num_tokens": 50479725.0, "step": 1324 }, { "epoch": 0.1685536191324259, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.90706729888916, "learning_rate": 5.612547689699024e-07, "loss": 0.5565, "mean_token_accuracy": 0.8266810178756714, "num_tokens": 50514734.0, "step": 1325 }, { "epoch": 0.16868082941101642, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.56332015991211, "learning_rate": 5.616786774056803e-07, "loss": 0.5347, "mean_token_accuracy": 0.8352147340774536, "num_tokens": 50555165.0, "step": 1326 }, { "epoch": 0.16880803968960692, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.758637428283691, "learning_rate": 5.621025858414582e-07, "loss": 0.5364, "mean_token_accuracy": 0.831735372543335, "num_tokens": 50598251.0, "step": 1327 }, { "epoch": 0.16893524996819742, "ewc_loss": 0.0181884765625, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 1.2993812561035156e-05, "grad_norm": 15.773377418518066, "learning_rate": 5.625264942772361e-07, "loss": 0.5285, "mean_token_accuracy": 0.8371549844741821, "num_tokens": 50638423.0, "step": 1328 }, { "epoch": 0.16906246024678795, "ewc_loss": 0.01806640625, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 1.2874603271484375e-05, "grad_norm": 15.607495307922363, "learning_rate": 5.629504027130139e-07, "loss": 0.5878, "mean_token_accuracy": 0.8256410360336304, "num_tokens": 50678226.0, "step": 1329 }, { "epoch": 0.16918967052537845, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2993812561035156e-05, "grad_norm": 15.823058128356934, "learning_rate": 5.633743111487919e-07, "loss": 0.5873, "mean_token_accuracy": 0.8142207264900208, "num_tokens": 50717542.0, "step": 1330 }, { "epoch": 0.16931688080396895, "ewc_loss": 0.0181884765625, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2934207916259766e-05, "grad_norm": 15.722732543945312, "learning_rate": 5.637982195845697e-07, "loss": 0.4665, "mean_token_accuracy": 0.8531857132911682, "num_tokens": 50752914.0, "step": 1331 }, { "epoch": 0.16944409108255948, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3113021850585938e-05, "grad_norm": 16.056114196777344, "learning_rate": 5.642221280203476e-07, "loss": 0.465, "mean_token_accuracy": 0.8554307818412781, "num_tokens": 50789276.0, "step": 1332 }, { "epoch": 0.16957130136114998, "ewc_loss": 0.0181884765625, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2934207916259766e-05, "grad_norm": 15.68416690826416, "learning_rate": 5.646460364561254e-07, "loss": 0.4865, "mean_token_accuracy": 0.8454360961914062, "num_tokens": 50820759.0, "step": 1333 }, { "epoch": 0.16969851163974048, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2993812561035156e-05, "grad_norm": 15.737168312072754, "learning_rate": 5.650699448919033e-07, "loss": 0.5776, "mean_token_accuracy": 0.8214818239212036, "num_tokens": 50863241.0, "step": 1334 }, { "epoch": 0.169825721918331, "ewc_loss": 0.0181884765625, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2934207916259766e-05, "grad_norm": 15.593027114868164, "learning_rate": 5.654938533276812e-07, "loss": 0.5174, "mean_token_accuracy": 0.8378602266311646, "num_tokens": 50904296.0, "step": 1335 }, { "epoch": 0.1699529321969215, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2993812561035156e-05, "grad_norm": 15.752808570861816, "learning_rate": 5.659177617634591e-07, "loss": 0.5016, "mean_token_accuracy": 0.8438203930854797, "num_tokens": 50942180.0, "step": 1336 }, { "epoch": 0.170080142475512, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.2993812561035156e-05, "grad_norm": 15.702685356140137, "learning_rate": 5.663416701992369e-07, "loss": 0.5292, "mean_token_accuracy": 0.8356294631958008, "num_tokens": 50980287.0, "step": 1337 }, { "epoch": 0.17020735275410254, "ewc_loss": 0.0184326171875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3172626495361328e-05, "grad_norm": 15.721017837524414, "learning_rate": 5.667655786350149e-07, "loss": 0.5936, "mean_token_accuracy": 0.8098940849304199, "num_tokens": 51014218.0, "step": 1338 }, { "epoch": 0.17033456303269304, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3113021850585938e-05, "grad_norm": 15.862909317016602, "learning_rate": 5.671894870707927e-07, "loss": 0.5085, "mean_token_accuracy": 0.8432484865188599, "num_tokens": 51055915.0, "step": 1339 }, { "epoch": 0.17046177331128357, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3053417205810547e-05, "grad_norm": 15.731656074523926, "learning_rate": 5.676133955065705e-07, "loss": 0.4505, "mean_token_accuracy": 0.8553059101104736, "num_tokens": 51090684.0, "step": 1340 }, { "epoch": 0.17058898358987407, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3113021850585938e-05, "grad_norm": 15.937555313110352, "learning_rate": 5.680373039423484e-07, "loss": 0.4992, "mean_token_accuracy": 0.8431072235107422, "num_tokens": 51130156.0, "step": 1341 }, { "epoch": 0.17071619386846457, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.329183578491211e-05, "grad_norm": 15.930961608886719, "learning_rate": 5.684612123781263e-07, "loss": 0.5393, "mean_token_accuracy": 0.830122172832489, "num_tokens": 51171282.0, "step": 1342 }, { "epoch": 0.1708434041470551, "ewc_loss": 0.018310546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3053417205810547e-05, "grad_norm": 15.77702808380127, "learning_rate": 5.688851208139042e-07, "loss": 0.5563, "mean_token_accuracy": 0.825541079044342, "num_tokens": 51210616.0, "step": 1343 }, { "epoch": 0.1709706144256456, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 1.3232231140136719e-05, "grad_norm": 15.79061508178711, "learning_rate": 5.69309029249682e-07, "loss": 0.5424, "mean_token_accuracy": 0.8357358574867249, "num_tokens": 51253357.0, "step": 1344 }, { "epoch": 0.1710978247042361, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 1.329183578491211e-05, "grad_norm": 15.85169792175293, "learning_rate": 5.697329376854599e-07, "loss": 0.5199, "mean_token_accuracy": 0.8361309766769409, "num_tokens": 51292671.0, "step": 1345 }, { "epoch": 0.17122503498282662, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 1.329183578491211e-05, "grad_norm": 15.935185432434082, "learning_rate": 5.701568461212378e-07, "loss": 0.4605, "mean_token_accuracy": 0.8564701080322266, "num_tokens": 51329145.0, "step": 1346 }, { "epoch": 0.17135224526141712, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 1.329183578491211e-05, "grad_norm": 15.841935157775879, "learning_rate": 5.705807545570157e-07, "loss": 0.469, "mean_token_accuracy": 0.8518480062484741, "num_tokens": 51371265.0, "step": 1347 }, { "epoch": 0.17147945554000762, "ewc_loss": 0.0184326171875, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 1.3172626495361328e-05, "grad_norm": 15.890849113464355, "learning_rate": 5.710046629927934e-07, "loss": 0.5076, "mean_token_accuracy": 0.838710606098175, "num_tokens": 51410262.0, "step": 1348 }, { "epoch": 0.17160666581859815, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 1.3232231140136719e-05, "grad_norm": 15.872880935668945, "learning_rate": 5.714285714285714e-07, "loss": 0.4771, "mean_token_accuracy": 0.8498203754425049, "num_tokens": 51447796.0, "step": 1349 }, { "epoch": 0.17173387609718865, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.329183578491211e-05, "grad_norm": 16.119325637817383, "learning_rate": 5.718524798643492e-07, "loss": 0.5421, "mean_token_accuracy": 0.8345941305160522, "num_tokens": 51482677.0, "step": 1350 }, { "epoch": 0.17186108637577915, "ewc_loss": 0.0186767578125, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.33514404296875e-05, "grad_norm": 16.11060905456543, "learning_rate": 5.722763883001272e-07, "loss": 0.5413, "mean_token_accuracy": 0.8342854380607605, "num_tokens": 51524111.0, "step": 1351 }, { "epoch": 0.17198829665436968, "ewc_loss": 0.0186767578125, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.33514404296875e-05, "grad_norm": 15.955883979797363, "learning_rate": 5.72700296735905e-07, "loss": 0.5147, "mean_token_accuracy": 0.8382635116577148, "num_tokens": 51561008.0, "step": 1352 }, { "epoch": 0.17211550693296018, "ewc_loss": 0.0186767578125, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.33514404296875e-05, "grad_norm": 15.976346969604492, "learning_rate": 5.731242051716829e-07, "loss": 0.5686, "mean_token_accuracy": 0.8200005888938904, "num_tokens": 51603091.0, "step": 1353 }, { "epoch": 0.17224271721155068, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.3232231140136719e-05, "grad_norm": 15.907078742980957, "learning_rate": 5.735481136074608e-07, "loss": 0.4836, "mean_token_accuracy": 0.8464881181716919, "num_tokens": 51637789.0, "step": 1354 }, { "epoch": 0.1723699274901412, "ewc_loss": 0.018798828125, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.3470649719238281e-05, "grad_norm": 15.997956275939941, "learning_rate": 5.739720220432386e-07, "loss": 0.4887, "mean_token_accuracy": 0.8434731960296631, "num_tokens": 51674207.0, "step": 1355 }, { "epoch": 0.1724971377687317, "ewc_loss": 0.0185546875, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.3172626495361328e-05, "grad_norm": 16.106369018554688, "learning_rate": 5.743959304790164e-07, "loss": 0.5853, "mean_token_accuracy": 0.821130633354187, "num_tokens": 51715109.0, "step": 1356 }, { "epoch": 0.1726243480473222, "ewc_loss": 0.018798828125, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 1.3470649719238281e-05, "grad_norm": 15.801384925842285, "learning_rate": 5.748198389147944e-07, "loss": 0.5323, "mean_token_accuracy": 0.8357933759689331, "num_tokens": 51753373.0, "step": 1357 }, { "epoch": 0.17275155832591274, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3530254364013672e-05, "grad_norm": 15.851815223693848, "learning_rate": 5.752437473505722e-07, "loss": 0.5704, "mean_token_accuracy": 0.8259499669075012, "num_tokens": 51795100.0, "step": 1358 }, { "epoch": 0.17287876860450324, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3530254364013672e-05, "grad_norm": 15.925105094909668, "learning_rate": 5.756676557863502e-07, "loss": 0.544, "mean_token_accuracy": 0.8299223184585571, "num_tokens": 51831840.0, "step": 1359 }, { "epoch": 0.17300597888309374, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3589859008789062e-05, "grad_norm": 15.824069023132324, "learning_rate": 5.76091564222128e-07, "loss": 0.4956, "mean_token_accuracy": 0.8453519344329834, "num_tokens": 51870227.0, "step": 1360 }, { "epoch": 0.17313318916168427, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3589859008789062e-05, "grad_norm": 16.028621673583984, "learning_rate": 5.765154726579059e-07, "loss": 0.522, "mean_token_accuracy": 0.8376803398132324, "num_tokens": 51908251.0, "step": 1361 }, { "epoch": 0.17326039944027477, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3530254364013672e-05, "grad_norm": 16.101537704467773, "learning_rate": 5.769393810936838e-07, "loss": 0.5109, "mean_token_accuracy": 0.8410308361053467, "num_tokens": 51944531.0, "step": 1362 }, { "epoch": 0.17338760971886527, "ewc_loss": 0.0191650390625, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3828277587890625e-05, "grad_norm": 16.322938919067383, "learning_rate": 5.773632895294616e-07, "loss": 0.5666, "mean_token_accuracy": 0.8232740163803101, "num_tokens": 51985742.0, "step": 1363 }, { "epoch": 0.1735148199974558, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3589859008789062e-05, "grad_norm": 15.957715034484863, "learning_rate": 5.777871979652394e-07, "loss": 0.5013, "mean_token_accuracy": 0.8436400890350342, "num_tokens": 52022036.0, "step": 1364 }, { "epoch": 0.1736420302760463, "ewc_loss": 0.0189208984375, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3589859008789062e-05, "grad_norm": 16.175695419311523, "learning_rate": 5.782111064010173e-07, "loss": 0.5244, "mean_token_accuracy": 0.8351541757583618, "num_tokens": 52066077.0, "step": 1365 }, { "epoch": 0.17376924055463683, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 1.3709068298339844e-05, "grad_norm": 16.017597198486328, "learning_rate": 5.786350148367952e-07, "loss": 0.5158, "mean_token_accuracy": 0.8387014865875244, "num_tokens": 52111280.0, "step": 1366 }, { "epoch": 0.17389645083322733, "ewc_loss": 0.0191650390625, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3768672943115234e-05, "grad_norm": 16.022037506103516, "learning_rate": 5.790589232725731e-07, "loss": 0.5133, "mean_token_accuracy": 0.8374621868133545, "num_tokens": 52153148.0, "step": 1367 }, { "epoch": 0.17402366111181783, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3589859008789062e-05, "grad_norm": 16.153654098510742, "learning_rate": 5.79482831708351e-07, "loss": 0.4981, "mean_token_accuracy": 0.8442628979682922, "num_tokens": 52186722.0, "step": 1368 }, { "epoch": 0.17415087139040836, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3709068298339844e-05, "grad_norm": 16.175941467285156, "learning_rate": 5.799067401441288e-07, "loss": 0.5097, "mean_token_accuracy": 0.8382478952407837, "num_tokens": 52220837.0, "step": 1369 }, { "epoch": 0.17427808166899886, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3709068298339844e-05, "grad_norm": 16.221834182739258, "learning_rate": 5.803306485799068e-07, "loss": 0.494, "mean_token_accuracy": 0.8470169305801392, "num_tokens": 52257049.0, "step": 1370 }, { "epoch": 0.17440529194758936, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3649463653564453e-05, "grad_norm": 16.087614059448242, "learning_rate": 5.807545570156845e-07, "loss": 0.4704, "mean_token_accuracy": 0.850570559501648, "num_tokens": 52297632.0, "step": 1371 }, { "epoch": 0.1745325022261799, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3649463653564453e-05, "grad_norm": 16.189247131347656, "learning_rate": 5.811784654514624e-07, "loss": 0.5718, "mean_token_accuracy": 0.8231720924377441, "num_tokens": 52331008.0, "step": 1372 }, { "epoch": 0.1746597125047704, "ewc_loss": 0.01904296875, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3709068298339844e-05, "grad_norm": 16.176010131835938, "learning_rate": 5.816023738872403e-07, "loss": 0.5019, "mean_token_accuracy": 0.8442758321762085, "num_tokens": 52369420.0, "step": 1373 }, { "epoch": 0.1747869227833609, "ewc_loss": 0.0191650390625, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 1.3768672943115234e-05, "grad_norm": 16.093555450439453, "learning_rate": 5.820262823230182e-07, "loss": 0.5335, "mean_token_accuracy": 0.8345751762390137, "num_tokens": 52411759.0, "step": 1374 }, { "epoch": 0.17491413306195142, "ewc_loss": 0.0191650390625, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3709068298339844e-05, "grad_norm": 16.052528381347656, "learning_rate": 5.824501907587961e-07, "loss": 0.4925, "mean_token_accuracy": 0.845129132270813, "num_tokens": 52446459.0, "step": 1375 }, { "epoch": 0.17504134334054192, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3887882232666016e-05, "grad_norm": 16.10586929321289, "learning_rate": 5.82874099194574e-07, "loss": 0.5767, "mean_token_accuracy": 0.8197227120399475, "num_tokens": 52484615.0, "step": 1376 }, { "epoch": 0.17516855361913242, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3828277587890625e-05, "grad_norm": 16.004968643188477, "learning_rate": 5.832980076303518e-07, "loss": 0.5873, "mean_token_accuracy": 0.8173185586929321, "num_tokens": 52525195.0, "step": 1377 }, { "epoch": 0.17529576389772294, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3828277587890625e-05, "grad_norm": 16.289379119873047, "learning_rate": 5.837219160661297e-07, "loss": 0.469, "mean_token_accuracy": 0.8530780076980591, "num_tokens": 52558783.0, "step": 1378 }, { "epoch": 0.17542297417631345, "ewc_loss": 0.0194091796875, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.4007091522216797e-05, "grad_norm": 16.321311950683594, "learning_rate": 5.841458245019075e-07, "loss": 0.4655, "mean_token_accuracy": 0.8531144857406616, "num_tokens": 52599176.0, "step": 1379 }, { "epoch": 0.17555018445490395, "ewc_loss": 0.0191650390625, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3768672943115234e-05, "grad_norm": 15.937642097473145, "learning_rate": 5.845697329376855e-07, "loss": 0.5226, "mean_token_accuracy": 0.8409318327903748, "num_tokens": 52630865.0, "step": 1380 }, { "epoch": 0.17567739473349447, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3887882232666016e-05, "grad_norm": 16.68610382080078, "learning_rate": 5.849936413734633e-07, "loss": 0.5739, "mean_token_accuracy": 0.8221777677536011, "num_tokens": 52671702.0, "step": 1381 }, { "epoch": 0.17580460501208497, "ewc_loss": 0.0194091796875, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 1.3947486877441406e-05, "grad_norm": 16.232486724853516, "learning_rate": 5.854175498092412e-07, "loss": 0.4778, "mean_token_accuracy": 0.8505080938339233, "num_tokens": 52712288.0, "step": 1382 }, { "epoch": 0.17593181529067548, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.3768672943115234e-05, "grad_norm": 16.135395050048828, "learning_rate": 5.858414582450191e-07, "loss": 0.5207, "mean_token_accuracy": 0.8391668796539307, "num_tokens": 52756569.0, "step": 1383 }, { "epoch": 0.176059025569266, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.3887882232666016e-05, "grad_norm": 16.387601852416992, "learning_rate": 5.86265366680797e-07, "loss": 0.5437, "mean_token_accuracy": 0.8320345282554626, "num_tokens": 52796195.0, "step": 1384 }, { "epoch": 0.1761862358478565, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.3828277587890625e-05, "grad_norm": 15.900676727294922, "learning_rate": 5.866892751165748e-07, "loss": 0.5157, "mean_token_accuracy": 0.8405542373657227, "num_tokens": 52839687.0, "step": 1385 }, { "epoch": 0.176313446126447, "ewc_loss": 0.019287109375, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.3887882232666016e-05, "grad_norm": 16.36607551574707, "learning_rate": 5.871131835523526e-07, "loss": 0.5697, "mean_token_accuracy": 0.8229636549949646, "num_tokens": 52875660.0, "step": 1386 }, { "epoch": 0.17644065640503753, "ewc_loss": 0.0194091796875, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.3947486877441406e-05, "grad_norm": 16.049882888793945, "learning_rate": 5.875370919881305e-07, "loss": 0.5265, "mean_token_accuracy": 0.8344110250473022, "num_tokens": 52921220.0, "step": 1387 }, { "epoch": 0.17656786668362803, "ewc_loss": 0.0194091796875, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.3947486877441406e-05, "grad_norm": 16.503141403198242, "learning_rate": 5.879610004239084e-07, "loss": 0.58, "mean_token_accuracy": 0.8228307366371155, "num_tokens": 52960042.0, "step": 1388 }, { "epoch": 0.17669507696221853, "ewc_loss": 0.01953125, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.4066696166992188e-05, "grad_norm": 15.92834758758545, "learning_rate": 5.883849088596863e-07, "loss": 0.5632, "mean_token_accuracy": 0.8257245421409607, "num_tokens": 53002817.0, "step": 1389 }, { "epoch": 0.17682228724080906, "ewc_loss": 0.0194091796875, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.4007091522216797e-05, "grad_norm": 16.401779174804688, "learning_rate": 5.888088172954641e-07, "loss": 0.538, "mean_token_accuracy": 0.8335150480270386, "num_tokens": 53037751.0, "step": 1390 }, { "epoch": 0.17694949751939956, "ewc_loss": 0.0196533203125, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.4185905456542969e-05, "grad_norm": 15.956275939941406, "learning_rate": 5.892327257312421e-07, "loss": 0.4479, "mean_token_accuracy": 0.860924243927002, "num_tokens": 53074286.0, "step": 1391 }, { "epoch": 0.1770767077979901, "ewc_loss": 0.01953125, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.4126300811767578e-05, "grad_norm": 16.471134185791016, "learning_rate": 5.896566341670199e-07, "loss": 0.4714, "mean_token_accuracy": 0.8474565744400024, "num_tokens": 53113525.0, "step": 1392 }, { "epoch": 0.1772039180765806, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.430511474609375e-05, "grad_norm": 16.2036075592041, "learning_rate": 5.900805426027977e-07, "loss": 0.5148, "mean_token_accuracy": 0.839332103729248, "num_tokens": 53150137.0, "step": 1393 }, { "epoch": 0.1773311283551711, "ewc_loss": 0.0194091796875, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.4007091522216797e-05, "grad_norm": 16.051006317138672, "learning_rate": 5.905044510385756e-07, "loss": 0.4657, "mean_token_accuracy": 0.8532124757766724, "num_tokens": 53189519.0, "step": 1394 }, { "epoch": 0.17745833863376162, "ewc_loss": 0.01953125, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.4185905456542969e-05, "grad_norm": 16.32986831665039, "learning_rate": 5.909283594743535e-07, "loss": 0.5802, "mean_token_accuracy": 0.8201754689216614, "num_tokens": 53227452.0, "step": 1395 }, { "epoch": 0.17758554891235212, "ewc_loss": 0.0196533203125, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.424551010131836e-05, "grad_norm": 16.352256774902344, "learning_rate": 5.913522679101314e-07, "loss": 0.5118, "mean_token_accuracy": 0.8363107442855835, "num_tokens": 53265875.0, "step": 1396 }, { "epoch": 0.17771275919094262, "ewc_loss": 0.01953125, "ewc_loss_diag": 5.424022674560547e-06, "ewc_loss_parallel": 1.4066696166992188e-05, "grad_norm": 16.40257453918457, "learning_rate": 5.917761763459093e-07, "loss": 0.5714, "mean_token_accuracy": 0.821154773235321, "num_tokens": 53305477.0, "step": 1397 }, { "epoch": 0.17783996946953315, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.424551010131836e-05, "grad_norm": 16.073169708251953, "learning_rate": 5.922000847816871e-07, "loss": 0.4893, "mean_token_accuracy": 0.8478637337684631, "num_tokens": 53342800.0, "step": 1398 }, { "epoch": 0.17796717974812365, "ewc_loss": 0.0196533203125, "ewc_loss_diag": 5.453824996948242e-06, "ewc_loss_parallel": 1.4185905456542969e-05, "grad_norm": 16.60580825805664, "learning_rate": 5.926239932174651e-07, "loss": 0.5546, "mean_token_accuracy": 0.8287679553031921, "num_tokens": 53387476.0, "step": 1399 }, { "epoch": 0.17809439002671415, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.424551010131836e-05, "grad_norm": 16.30879020690918, "learning_rate": 5.930479016532429e-07, "loss": 0.5118, "mean_token_accuracy": 0.8398197293281555, "num_tokens": 53424628.0, "step": 1400 }, { "epoch": 0.17822160030530468, "ewc_loss": 0.0196533203125, "ewc_loss_diag": 5.453824996948242e-06, "ewc_loss_parallel": 1.424551010131836e-05, "grad_norm": 16.079513549804688, "learning_rate": 5.934718100890207e-07, "loss": 0.4936, "mean_token_accuracy": 0.8424100875854492, "num_tokens": 53464459.0, "step": 1401 }, { "epoch": 0.17834881058389518, "ewc_loss": 0.01953125, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 1.4066696166992188e-05, "grad_norm": 16.4020938873291, "learning_rate": 5.938957185247986e-07, "loss": 0.5395, "mean_token_accuracy": 0.8293029069900513, "num_tokens": 53501721.0, "step": 1402 }, { "epoch": 0.17847602086248568, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.430511474609375e-05, "grad_norm": 16.228193283081055, "learning_rate": 5.943196269605765e-07, "loss": 0.5009, "mean_token_accuracy": 0.8400145769119263, "num_tokens": 53536607.0, "step": 1403 }, { "epoch": 0.1786032311410762, "ewc_loss": 0.01953125, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.4126300811767578e-05, "grad_norm": 16.07645606994629, "learning_rate": 5.947435353963544e-07, "loss": 0.5533, "mean_token_accuracy": 0.8315255641937256, "num_tokens": 53569809.0, "step": 1404 }, { "epoch": 0.1787304414196667, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.424551010131836e-05, "grad_norm": 16.127174377441406, "learning_rate": 5.951674438321323e-07, "loss": 0.5286, "mean_token_accuracy": 0.8353145122528076, "num_tokens": 53604423.0, "step": 1405 }, { "epoch": 0.1788576516982572, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.453824996948242e-06, "ewc_loss_parallel": 1.430511474609375e-05, "grad_norm": 16.086435317993164, "learning_rate": 5.955913522679101e-07, "loss": 0.5257, "mean_token_accuracy": 0.8333963751792908, "num_tokens": 53642145.0, "step": 1406 }, { "epoch": 0.17898486197684774, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.430511474609375e-05, "grad_norm": 16.312255859375, "learning_rate": 5.96015260703688e-07, "loss": 0.6016, "mean_token_accuracy": 0.8111861348152161, "num_tokens": 53682998.0, "step": 1407 }, { "epoch": 0.17911207225543824, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.453824996948242e-06, "ewc_loss_parallel": 1.436471939086914e-05, "grad_norm": 16.136940002441406, "learning_rate": 5.964391691394659e-07, "loss": 0.5336, "mean_token_accuracy": 0.8359869718551636, "num_tokens": 53729764.0, "step": 1408 }, { "epoch": 0.17923928253402874, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.430511474609375e-05, "grad_norm": 16.312894821166992, "learning_rate": 5.968630775752436e-07, "loss": 0.5683, "mean_token_accuracy": 0.8242913484573364, "num_tokens": 53764522.0, "step": 1409 }, { "epoch": 0.17936649281261927, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.453197479248047, "learning_rate": 5.972869860110216e-07, "loss": 0.5498, "mean_token_accuracy": 0.8250975608825684, "num_tokens": 53799791.0, "step": 1410 }, { "epoch": 0.17949370309120977, "ewc_loss": 0.019775390625, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.430511474609375e-05, "grad_norm": 16.11321449279785, "learning_rate": 5.977108944467994e-07, "loss": 0.5875, "mean_token_accuracy": 0.8219059705734253, "num_tokens": 53840181.0, "step": 1411 }, { "epoch": 0.17962091336980027, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.4754695892334, "learning_rate": 5.981348028825774e-07, "loss": 0.4928, "mean_token_accuracy": 0.8487801551818848, "num_tokens": 53882813.0, "step": 1412 }, { "epoch": 0.1797481236483908, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.34213638305664, "learning_rate": 5.985587113183552e-07, "loss": 0.5427, "mean_token_accuracy": 0.8323584198951721, "num_tokens": 53922198.0, "step": 1413 }, { "epoch": 0.1798753339269813, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.4836273193359375e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.3582763671875, "learning_rate": 5.989826197541331e-07, "loss": 0.5369, "mean_token_accuracy": 0.8348175883293152, "num_tokens": 53959390.0, "step": 1414 }, { "epoch": 0.18000254420557182, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 1.4543533325195312e-05, "grad_norm": 16.281667709350586, "learning_rate": 5.99406528189911e-07, "loss": 0.5667, "mean_token_accuracy": 0.8294365406036377, "num_tokens": 54005707.0, "step": 1415 }, { "epoch": 0.18012975448416232, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.55852699279785, "learning_rate": 5.998304366256888e-07, "loss": 0.5292, "mean_token_accuracy": 0.8341628313064575, "num_tokens": 54047808.0, "step": 1416 }, { "epoch": 0.18025696476275282, "ewc_loss": 0.0201416015625, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4603137969970703e-05, "grad_norm": 16.493980407714844, "learning_rate": 6.002543450614666e-07, "loss": 0.4526, "mean_token_accuracy": 0.8608202338218689, "num_tokens": 54082929.0, "step": 1417 }, { "epoch": 0.18038417504134335, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4543533325195312e-05, "grad_norm": 16.474231719970703, "learning_rate": 6.006782534972446e-07, "loss": 0.5145, "mean_token_accuracy": 0.8396873474121094, "num_tokens": 54120199.0, "step": 1418 }, { "epoch": 0.18051138531993385, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4424324035644531e-05, "grad_norm": 16.500152587890625, "learning_rate": 6.011021619330224e-07, "loss": 0.5501, "mean_token_accuracy": 0.8278611898422241, "num_tokens": 54158629.0, "step": 1419 }, { "epoch": 0.18063859559852435, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.49225616455078, "learning_rate": 6.015260703688004e-07, "loss": 0.5299, "mean_token_accuracy": 0.8318992853164673, "num_tokens": 54194772.0, "step": 1420 }, { "epoch": 0.18076580587711488, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4543533325195312e-05, "grad_norm": 16.38788604736328, "learning_rate": 6.019499788045782e-07, "loss": 0.539, "mean_token_accuracy": 0.8327590227127075, "num_tokens": 54229825.0, "step": 1421 }, { "epoch": 0.18089301615570538, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.26898765563965, "learning_rate": 6.023738872403561e-07, "loss": 0.4939, "mean_token_accuracy": 0.842728316783905, "num_tokens": 54260051.0, "step": 1422 }, { "epoch": 0.18102022643429588, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4543533325195312e-05, "grad_norm": 16.4495849609375, "learning_rate": 6.02797795676134e-07, "loss": 0.5203, "mean_token_accuracy": 0.8417260646820068, "num_tokens": 54297865.0, "step": 1423 }, { "epoch": 0.1811474367128864, "ewc_loss": 0.0201416015625, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4603137969970703e-05, "grad_norm": 16.370540618896484, "learning_rate": 6.032217041119118e-07, "loss": 0.5364, "mean_token_accuracy": 0.8289446830749512, "num_tokens": 54334618.0, "step": 1424 }, { "epoch": 0.1812746469914769, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.23809242248535, "learning_rate": 6.036456125476896e-07, "loss": 0.5737, "mean_token_accuracy": 0.8221185207366943, "num_tokens": 54369761.0, "step": 1425 }, { "epoch": 0.1814018572700674, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4543533325195312e-05, "grad_norm": 16.5283260345459, "learning_rate": 6.040695209834675e-07, "loss": 0.4715, "mean_token_accuracy": 0.8503060340881348, "num_tokens": 54411549.0, "step": 1426 }, { "epoch": 0.18152906754865794, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4722347259521484e-05, "grad_norm": 16.66238021850586, "learning_rate": 6.044934294192454e-07, "loss": 0.5561, "mean_token_accuracy": 0.8287453651428223, "num_tokens": 54449137.0, "step": 1427 }, { "epoch": 0.18165627782724844, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4662742614746094e-05, "grad_norm": 16.54011344909668, "learning_rate": 6.049173378550233e-07, "loss": 0.511, "mean_token_accuracy": 0.8429827094078064, "num_tokens": 54494604.0, "step": 1428 }, { "epoch": 0.18178348810583894, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4543533325195312e-05, "grad_norm": 16.581064224243164, "learning_rate": 6.053412462908012e-07, "loss": 0.4915, "mean_token_accuracy": 0.8448514938354492, "num_tokens": 54533533.0, "step": 1429 }, { "epoch": 0.18191069838442947, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4722347259521484e-05, "grad_norm": 16.730562210083008, "learning_rate": 6.05765154726579e-07, "loss": 0.5644, "mean_token_accuracy": 0.8258182406425476, "num_tokens": 54575211.0, "step": 1430 }, { "epoch": 0.18203790866301997, "ewc_loss": 0.02001953125, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4483928680419922e-05, "grad_norm": 16.611995697021484, "learning_rate": 6.061890631623569e-07, "loss": 0.468, "mean_token_accuracy": 0.8523236513137817, "num_tokens": 54618308.0, "step": 1431 }, { "epoch": 0.18216511894161047, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4662742614746094e-05, "grad_norm": 16.691553115844727, "learning_rate": 6.066129715981347e-07, "loss": 0.507, "mean_token_accuracy": 0.8414138555526733, "num_tokens": 54660260.0, "step": 1432 }, { "epoch": 0.182292329220201, "ewc_loss": 0.0201416015625, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4603137969970703e-05, "grad_norm": 16.506820678710938, "learning_rate": 6.070368800339126e-07, "loss": 0.481, "mean_token_accuracy": 0.8483322262763977, "num_tokens": 54692056.0, "step": 1433 }, { "epoch": 0.1824195394987915, "ewc_loss": 0.0201416015625, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4603137969970703e-05, "grad_norm": 16.699790954589844, "learning_rate": 6.074607884696905e-07, "loss": 0.501, "mean_token_accuracy": 0.842112123966217, "num_tokens": 54729252.0, "step": 1434 }, { "epoch": 0.182546749777382, "ewc_loss": 0.0201416015625, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4603137969970703e-05, "grad_norm": 16.66986656188965, "learning_rate": 6.078846969054684e-07, "loss": 0.5249, "mean_token_accuracy": 0.8341892957687378, "num_tokens": 54763422.0, "step": 1435 }, { "epoch": 0.18267396005597253, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4662742614746094e-05, "grad_norm": 16.46466636657715, "learning_rate": 6.083086053412463e-07, "loss": 0.498, "mean_token_accuracy": 0.8446297645568848, "num_tokens": 54803494.0, "step": 1436 }, { "epoch": 0.18280117033456303, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4662742614746094e-05, "grad_norm": 16.608583450317383, "learning_rate": 6.087325137770242e-07, "loss": 0.4894, "mean_token_accuracy": 0.8461459279060364, "num_tokens": 54839349.0, "step": 1437 }, { "epoch": 0.18292838061315353, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4662742614746094e-05, "grad_norm": 16.553184509277344, "learning_rate": 6.09156422212802e-07, "loss": 0.5631, "mean_token_accuracy": 0.8291735053062439, "num_tokens": 54881941.0, "step": 1438 }, { "epoch": 0.18305559089174406, "ewc_loss": 0.0201416015625, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 1.4603137969970703e-05, "grad_norm": 16.60692596435547, "learning_rate": 6.095803306485799e-07, "loss": 0.5126, "mean_token_accuracy": 0.8371871709823608, "num_tokens": 54922927.0, "step": 1439 }, { "epoch": 0.18318280117033456, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4722347259521484e-05, "grad_norm": 16.74640464782715, "learning_rate": 6.100042390843577e-07, "loss": 0.5421, "mean_token_accuracy": 0.8306201696395874, "num_tokens": 54963885.0, "step": 1440 }, { "epoch": 0.1833100114489251, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4722347259521484e-05, "grad_norm": 16.563716888427734, "learning_rate": 6.104281475201356e-07, "loss": 0.49, "mean_token_accuracy": 0.8446162939071655, "num_tokens": 55001284.0, "step": 1441 }, { "epoch": 0.1834372217275156, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4662742614746094e-05, "grad_norm": 16.57772445678711, "learning_rate": 6.108520559559135e-07, "loss": 0.5507, "mean_token_accuracy": 0.8290917873382568, "num_tokens": 55034813.0, "step": 1442 }, { "epoch": 0.1835644320061061, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4722347259521484e-05, "grad_norm": 16.640626907348633, "learning_rate": 6.112759643916914e-07, "loss": 0.4552, "mean_token_accuracy": 0.8566104173660278, "num_tokens": 55070620.0, "step": 1443 }, { "epoch": 0.18369164228469662, "ewc_loss": 0.0203857421875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4781951904296875e-05, "grad_norm": 16.55537986755371, "learning_rate": 6.116998728274693e-07, "loss": 0.5381, "mean_token_accuracy": 0.837773323059082, "num_tokens": 55108394.0, "step": 1444 }, { "epoch": 0.18381885256328712, "ewc_loss": 0.0203857421875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4841556549072266e-05, "grad_norm": 16.723142623901367, "learning_rate": 6.121237812632472e-07, "loss": 0.5429, "mean_token_accuracy": 0.830082893371582, "num_tokens": 55148170.0, "step": 1445 }, { "epoch": 0.18394606284187762, "ewc_loss": 0.0203857421875, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.4781951904296875e-05, "grad_norm": 16.651813507080078, "learning_rate": 6.125476896990249e-07, "loss": 0.4829, "mean_token_accuracy": 0.8503615856170654, "num_tokens": 55188102.0, "step": 1446 }, { "epoch": 0.18407327312046814, "ewc_loss": 0.020263671875, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.4722347259521484e-05, "grad_norm": 16.6279354095459, "learning_rate": 6.129715981348028e-07, "loss": 0.5225, "mean_token_accuracy": 0.8413265943527222, "num_tokens": 55225893.0, "step": 1447 }, { "epoch": 0.18420048339905865, "ewc_loss": 0.0203857421875, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4841556549072266e-05, "grad_norm": 16.657407760620117, "learning_rate": 6.133955065705807e-07, "loss": 0.5421, "mean_token_accuracy": 0.8289095163345337, "num_tokens": 55265321.0, "step": 1448 }, { "epoch": 0.18432769367764915, "ewc_loss": 0.0205078125, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 1.4901161193847656e-05, "grad_norm": 16.48067283630371, "learning_rate": 6.138194150063585e-07, "loss": 0.5016, "mean_token_accuracy": 0.8454762697219849, "num_tokens": 55304179.0, "step": 1449 }, { "epoch": 0.18445490395623967, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.771011352539062, "learning_rate": 6.142433234421365e-07, "loss": 0.5366, "mean_token_accuracy": 0.8325845003128052, "num_tokens": 55343266.0, "step": 1450 }, { "epoch": 0.18458211423483017, "ewc_loss": 0.0205078125, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.4960765838623047e-05, "grad_norm": 16.567819595336914, "learning_rate": 6.146672318779143e-07, "loss": 0.541, "mean_token_accuracy": 0.8324258327484131, "num_tokens": 55381147.0, "step": 1451 }, { "epoch": 0.18470932451342068, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.5079975128173828e-05, "grad_norm": 16.578962326049805, "learning_rate": 6.150911403136923e-07, "loss": 0.5413, "mean_token_accuracy": 0.8341002464294434, "num_tokens": 55414855.0, "step": 1452 }, { "epoch": 0.1848365347920112, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.5079975128173828e-05, "grad_norm": 16.551759719848633, "learning_rate": 6.155150487494701e-07, "loss": 0.4638, "mean_token_accuracy": 0.8541514873504639, "num_tokens": 55449699.0, "step": 1453 }, { "epoch": 0.1849637450706017, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 1.5139579772949219e-05, "grad_norm": 16.876558303833008, "learning_rate": 6.159389571852479e-07, "loss": 0.4915, "mean_token_accuracy": 0.8476496934890747, "num_tokens": 55493202.0, "step": 1454 }, { "epoch": 0.1850909553491922, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.541790008544922, "learning_rate": 6.163628656210258e-07, "loss": 0.5149, "mean_token_accuracy": 0.8407831192016602, "num_tokens": 55539873.0, "step": 1455 }, { "epoch": 0.18521816562778273, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.5139579772949219e-05, "grad_norm": 16.745664596557617, "learning_rate": 6.167867740568037e-07, "loss": 0.5744, "mean_token_accuracy": 0.8226376175880432, "num_tokens": 55576338.0, "step": 1456 }, { "epoch": 0.18534537590637323, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.776248931884766, "learning_rate": 6.172106824925815e-07, "loss": 0.4936, "mean_token_accuracy": 0.8447114825248718, "num_tokens": 55618330.0, "step": 1457 }, { "epoch": 0.18547258618496373, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.5079975128173828e-05, "grad_norm": 16.634456634521484, "learning_rate": 6.176345909283595e-07, "loss": 0.511, "mean_token_accuracy": 0.8401740789413452, "num_tokens": 55657394.0, "step": 1458 }, { "epoch": 0.18559979646355426, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.892513275146484, "learning_rate": 6.180584993641373e-07, "loss": 0.4973, "mean_token_accuracy": 0.8446688652038574, "num_tokens": 55695087.0, "step": 1459 }, { "epoch": 0.18572700674214476, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.5139579772949219e-05, "grad_norm": 16.764982223510742, "learning_rate": 6.184824077999153e-07, "loss": 0.4804, "mean_token_accuracy": 0.8494575619697571, "num_tokens": 55730852.0, "step": 1460 }, { "epoch": 0.18585421702073526, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 17.022899627685547, "learning_rate": 6.189063162356931e-07, "loss": 0.5777, "mean_token_accuracy": 0.8214768171310425, "num_tokens": 55765678.0, "step": 1461 }, { "epoch": 0.1859814272993258, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.925081253051758, "learning_rate": 6.193302246714709e-07, "loss": 0.5029, "mean_token_accuracy": 0.8453587889671326, "num_tokens": 55804409.0, "step": 1462 }, { "epoch": 0.1861086375779163, "ewc_loss": 0.020751953125, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 1.5079975128173828e-05, "grad_norm": 16.880229949951172, "learning_rate": 6.197541331072488e-07, "loss": 0.5324, "mean_token_accuracy": 0.8344346284866333, "num_tokens": 55839520.0, "step": 1463 }, { "epoch": 0.1862358478565068, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.848215103149414, "learning_rate": 6.201780415430267e-07, "loss": 0.5122, "mean_token_accuracy": 0.840237021446228, "num_tokens": 55879089.0, "step": 1464 }, { "epoch": 0.18636305813509732, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 1.519918441772461e-05, "grad_norm": 16.781211853027344, "learning_rate": 6.206019499788045e-07, "loss": 0.5576, "mean_token_accuracy": 0.8276112079620361, "num_tokens": 55916655.0, "step": 1465 }, { "epoch": 0.18649026841368782, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 16.77446937561035, "learning_rate": 6.210258584145825e-07, "loss": 0.4904, "mean_token_accuracy": 0.8433880805969238, "num_tokens": 55953828.0, "step": 1466 }, { "epoch": 0.18661747869227835, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.632638931274414e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 16.798992156982422, "learning_rate": 6.214497668503603e-07, "loss": 0.5384, "mean_token_accuracy": 0.8286365270614624, "num_tokens": 55992037.0, "step": 1467 }, { "epoch": 0.18674468897086885, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 16.960886001586914, "learning_rate": 6.218736752861383e-07, "loss": 0.4998, "mean_token_accuracy": 0.8454060554504395, "num_tokens": 56020424.0, "step": 1468 }, { "epoch": 0.18687189924945935, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 16.907907485961914, "learning_rate": 6.22297583721916e-07, "loss": 0.5305, "mean_token_accuracy": 0.8406732082366943, "num_tokens": 56060839.0, "step": 1469 }, { "epoch": 0.18699910952804988, "ewc_loss": 0.0208740234375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.5139579772949219e-05, "grad_norm": 16.867475509643555, "learning_rate": 6.227214921576938e-07, "loss": 0.5629, "mean_token_accuracy": 0.8296210169792175, "num_tokens": 56099095.0, "step": 1470 }, { "epoch": 0.18712631980664038, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 16.973674774169922, "learning_rate": 6.231454005934718e-07, "loss": 0.4573, "mean_token_accuracy": 0.8573947548866272, "num_tokens": 56133805.0, "step": 1471 }, { "epoch": 0.18725353008523088, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 16.7384033203125, "learning_rate": 6.235693090292496e-07, "loss": 0.5783, "mean_token_accuracy": 0.8238108158111572, "num_tokens": 56173064.0, "step": 1472 }, { "epoch": 0.1873807403638214, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 17.186635971069336, "learning_rate": 6.239932174650275e-07, "loss": 0.4577, "mean_token_accuracy": 0.8598483204841614, "num_tokens": 56213285.0, "step": 1473 }, { "epoch": 0.1875079506424119, "ewc_loss": 0.021240234375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.5497207641601562e-05, "grad_norm": 16.96658706665039, "learning_rate": 6.244171259008054e-07, "loss": 0.5423, "mean_token_accuracy": 0.8291717767715454, "num_tokens": 56248218.0, "step": 1474 }, { "epoch": 0.1876351609210024, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 16.965042114257812, "learning_rate": 6.248410343365833e-07, "loss": 0.4853, "mean_token_accuracy": 0.8499987721443176, "num_tokens": 56286098.0, "step": 1475 }, { "epoch": 0.18776237119959294, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 17.27486801147461, "learning_rate": 6.252649427723612e-07, "loss": 0.4931, "mean_token_accuracy": 0.8476934432983398, "num_tokens": 56329732.0, "step": 1476 }, { "epoch": 0.18788958147818344, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 16.86240577697754, "learning_rate": 6.25688851208139e-07, "loss": 0.4981, "mean_token_accuracy": 0.8459399938583374, "num_tokens": 56367838.0, "step": 1477 }, { "epoch": 0.18801679175677394, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 16.908544540405273, "learning_rate": 6.261127596439168e-07, "loss": 0.5503, "mean_token_accuracy": 0.8281154632568359, "num_tokens": 56413059.0, "step": 1478 }, { "epoch": 0.18814400203536447, "ewc_loss": 0.02099609375, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.52587890625e-05, "grad_norm": 17.324356079101562, "learning_rate": 6.265366680796948e-07, "loss": 0.4533, "mean_token_accuracy": 0.8533479571342468, "num_tokens": 56453806.0, "step": 1479 }, { "epoch": 0.18827121231395497, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 16.62721061706543, "learning_rate": 6.269605765154726e-07, "loss": 0.5017, "mean_token_accuracy": 0.8426319360733032, "num_tokens": 56497448.0, "step": 1480 }, { "epoch": 0.18839842259254547, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 17.030536651611328, "learning_rate": 6.273844849512505e-07, "loss": 0.5031, "mean_token_accuracy": 0.8419311046600342, "num_tokens": 56537731.0, "step": 1481 }, { "epoch": 0.188525632871136, "ewc_loss": 0.021240234375, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 1.5497207641601562e-05, "grad_norm": 17.058557510375977, "learning_rate": 6.278083933870284e-07, "loss": 0.4425, "mean_token_accuracy": 0.8621734380722046, "num_tokens": 56580944.0, "step": 1482 }, { "epoch": 0.1886528431497265, "ewc_loss": 0.0211181640625, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 1.537799835205078e-05, "grad_norm": 16.586027145385742, "learning_rate": 6.282323018228063e-07, "loss": 0.475, "mean_token_accuracy": 0.8522317409515381, "num_tokens": 56616306.0, "step": 1483 }, { "epoch": 0.188780053428317, "ewc_loss": 0.021240234375, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5497207641601562e-05, "grad_norm": 17.035612106323242, "learning_rate": 6.286562102585841e-07, "loss": 0.5551, "mean_token_accuracy": 0.8315657377243042, "num_tokens": 56661874.0, "step": 1484 }, { "epoch": 0.18890726370690752, "ewc_loss": 0.021240234375, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5497207641601562e-05, "grad_norm": 16.830772399902344, "learning_rate": 6.29080118694362e-07, "loss": 0.4498, "mean_token_accuracy": 0.8567143678665161, "num_tokens": 56696697.0, "step": 1485 }, { "epoch": 0.18903447398549802, "ewc_loss": 0.021240234375, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5497207641601562e-05, "grad_norm": 16.755268096923828, "learning_rate": 6.295040271301398e-07, "loss": 0.4498, "mean_token_accuracy": 0.8569761514663696, "num_tokens": 56732581.0, "step": 1486 }, { "epoch": 0.18916168426408853, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 16.996315002441406, "learning_rate": 6.299279355659178e-07, "loss": 0.4999, "mean_token_accuracy": 0.8447368741035461, "num_tokens": 56777905.0, "step": 1487 }, { "epoch": 0.18928889454267905, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 17.00953483581543, "learning_rate": 6.303518440016956e-07, "loss": 0.5195, "mean_token_accuracy": 0.8365393877029419, "num_tokens": 56812915.0, "step": 1488 }, { "epoch": 0.18941610482126955, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 16.874982833862305, "learning_rate": 6.307757524374735e-07, "loss": 0.5148, "mean_token_accuracy": 0.8418969511985779, "num_tokens": 56846907.0, "step": 1489 }, { "epoch": 0.18954331509986008, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 17.152570724487305, "learning_rate": 6.311996608732514e-07, "loss": 0.5416, "mean_token_accuracy": 0.8362593650817871, "num_tokens": 56884614.0, "step": 1490 }, { "epoch": 0.18967052537845058, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 17.004161834716797, "learning_rate": 6.316235693090292e-07, "loss": 0.534, "mean_token_accuracy": 0.8332067131996155, "num_tokens": 56920508.0, "step": 1491 }, { "epoch": 0.18979773565704108, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 16.85991859436035, "learning_rate": 6.320474777448071e-07, "loss": 0.5366, "mean_token_accuracy": 0.8319489359855652, "num_tokens": 56957629.0, "step": 1492 }, { "epoch": 0.1899249459356316, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 16.802785873413086, "learning_rate": 6.324713861805849e-07, "loss": 0.4949, "mean_token_accuracy": 0.8434799909591675, "num_tokens": 56997265.0, "step": 1493 }, { "epoch": 0.1900521562142221, "ewc_loss": 0.0213623046875, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 17.282846450805664, "learning_rate": 6.328952946163628e-07, "loss": 0.5991, "mean_token_accuracy": 0.8184353113174438, "num_tokens": 57031519.0, "step": 1494 }, { "epoch": 0.1901793664928126, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 16.832246780395508, "learning_rate": 6.333192030521407e-07, "loss": 0.4685, "mean_token_accuracy": 0.850145161151886, "num_tokens": 57067264.0, "step": 1495 }, { "epoch": 0.19030657677140314, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.086679458618164, "learning_rate": 6.337431114879186e-07, "loss": 0.5233, "mean_token_accuracy": 0.8375622034072876, "num_tokens": 57104906.0, "step": 1496 }, { "epoch": 0.19043378704999364, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 17.20203971862793, "learning_rate": 6.341670199236965e-07, "loss": 0.4872, "mean_token_accuracy": 0.8485921025276184, "num_tokens": 57142534.0, "step": 1497 }, { "epoch": 0.19056099732858414, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 16.885339736938477, "learning_rate": 6.345909283594744e-07, "loss": 0.4701, "mean_token_accuracy": 0.8532764911651611, "num_tokens": 57188735.0, "step": 1498 }, { "epoch": 0.19068820760717467, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5616416931152344e-05, "grad_norm": 17.075042724609375, "learning_rate": 6.350148367952522e-07, "loss": 0.5234, "mean_token_accuracy": 0.83115553855896, "num_tokens": 57224610.0, "step": 1499 }, { "epoch": 0.19081541788576517, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.07977867126465, "learning_rate": 6.354387452310301e-07, "loss": 0.4772, "mean_token_accuracy": 0.8476324677467346, "num_tokens": 57259553.0, "step": 1500 }, { "epoch": 0.19094262816435567, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.165739059448242, "learning_rate": 6.358626536668079e-07, "loss": 0.4823, "mean_token_accuracy": 0.8496835231781006, "num_tokens": 57295819.0, "step": 1501 }, { "epoch": 0.1910698384429462, "ewc_loss": 0.021484375, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 16.94681739807129, "learning_rate": 6.362865621025858e-07, "loss": 0.469, "mean_token_accuracy": 0.8548409938812256, "num_tokens": 57339110.0, "step": 1502 }, { "epoch": 0.1911970487215367, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.073246002197266, "learning_rate": 6.367104705383637e-07, "loss": 0.4992, "mean_token_accuracy": 0.8440870046615601, "num_tokens": 57377311.0, "step": 1503 }, { "epoch": 0.1913242590001272, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.119104385375977, "learning_rate": 6.371343789741416e-07, "loss": 0.5116, "mean_token_accuracy": 0.8369472622871399, "num_tokens": 57419281.0, "step": 1504 }, { "epoch": 0.19145146927871773, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 16.96651840209961, "learning_rate": 6.375582874099195e-07, "loss": 0.4979, "mean_token_accuracy": 0.845608115196228, "num_tokens": 57458134.0, "step": 1505 }, { "epoch": 0.19157867955730823, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.185049057006836, "learning_rate": 6.379821958456974e-07, "loss": 0.5535, "mean_token_accuracy": 0.8272844552993774, "num_tokens": 57492931.0, "step": 1506 }, { "epoch": 0.19170588983589873, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.107397079467773, "learning_rate": 6.384061042814751e-07, "loss": 0.5238, "mean_token_accuracy": 0.8358079195022583, "num_tokens": 57529581.0, "step": 1507 }, { "epoch": 0.19183310011448926, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 16.957984924316406, "learning_rate": 6.38830012717253e-07, "loss": 0.5108, "mean_token_accuracy": 0.8415905833244324, "num_tokens": 57570302.0, "step": 1508 }, { "epoch": 0.19196031039307976, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.14682388305664, "learning_rate": 6.392539211530309e-07, "loss": 0.5242, "mean_token_accuracy": 0.8352476358413696, "num_tokens": 57615974.0, "step": 1509 }, { "epoch": 0.19208752067167026, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.0904598236084, "learning_rate": 6.396778295888087e-07, "loss": 0.4874, "mean_token_accuracy": 0.849656343460083, "num_tokens": 57653737.0, "step": 1510 }, { "epoch": 0.1922147309502608, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.249971389770508, "learning_rate": 6.401017380245867e-07, "loss": 0.5212, "mean_token_accuracy": 0.8362177014350891, "num_tokens": 57693431.0, "step": 1511 }, { "epoch": 0.1923419412288513, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.173553466796875, "learning_rate": 6.405256464603645e-07, "loss": 0.506, "mean_token_accuracy": 0.8409433364868164, "num_tokens": 57736413.0, "step": 1512 }, { "epoch": 0.1924691515074418, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 16.970060348510742, "learning_rate": 6.409495548961425e-07, "loss": 0.4904, "mean_token_accuracy": 0.8490724563598633, "num_tokens": 57773293.0, "step": 1513 }, { "epoch": 0.19259636178603232, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.324338912963867, "learning_rate": 6.413734633319203e-07, "loss": 0.5032, "mean_token_accuracy": 0.8448711037635803, "num_tokens": 57809294.0, "step": 1514 }, { "epoch": 0.19272357206462282, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.10457420349121, "learning_rate": 6.417973717676981e-07, "loss": 0.4734, "mean_token_accuracy": 0.8501760959625244, "num_tokens": 57843610.0, "step": 1515 }, { "epoch": 0.19285078234321335, "ewc_loss": 0.021728515625, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 1.5854835510253906e-05, "grad_norm": 17.413732528686523, "learning_rate": 6.42221280203476e-07, "loss": 0.534, "mean_token_accuracy": 0.8307210803031921, "num_tokens": 57893098.0, "step": 1516 }, { "epoch": 0.19297799262180385, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.8710575103759766e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.15647315979004, "learning_rate": 6.426451886392539e-07, "loss": 0.5611, "mean_token_accuracy": 0.8302667737007141, "num_tokens": 57933098.0, "step": 1517 }, { "epoch": 0.19310520290039435, "ewc_loss": 0.021728515625, "ewc_loss_diag": 5.900859832763672e-06, "ewc_loss_parallel": 1.5854835510253906e-05, "grad_norm": 17.212175369262695, "learning_rate": 6.430690970750317e-07, "loss": 0.4925, "mean_token_accuracy": 0.8462051153182983, "num_tokens": 57971502.0, "step": 1518 }, { "epoch": 0.19323241317898487, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.900859832763672e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.28212547302246, "learning_rate": 6.434930055108097e-07, "loss": 0.5434, "mean_token_accuracy": 0.8287407159805298, "num_tokens": 58001557.0, "step": 1519 }, { "epoch": 0.19335962345757537, "ewc_loss": 0.0216064453125, "ewc_loss_diag": 5.900859832763672e-06, "ewc_loss_parallel": 1.5735626220703125e-05, "grad_norm": 17.267730712890625, "learning_rate": 6.439169139465875e-07, "loss": 0.5525, "mean_token_accuracy": 0.8272191882133484, "num_tokens": 58038058.0, "step": 1520 }, { "epoch": 0.19348683373616588, "ewc_loss": 0.021728515625, "ewc_loss_diag": 5.900859832763672e-06, "ewc_loss_parallel": 1.5854835510253906e-05, "grad_norm": 17.254837036132812, "learning_rate": 6.443408223823655e-07, "loss": 0.519, "mean_token_accuracy": 0.8370181918144226, "num_tokens": 58073138.0, "step": 1521 }, { "epoch": 0.1936140440147564, "ewc_loss": 0.021728515625, "ewc_loss_diag": 5.900859832763672e-06, "ewc_loss_parallel": 1.5854835510253906e-05, "grad_norm": 17.17220115661621, "learning_rate": 6.447647308181432e-07, "loss": 0.5105, "mean_token_accuracy": 0.8388180732727051, "num_tokens": 58111793.0, "step": 1522 }, { "epoch": 0.1937412542933469, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.5974044799804688e-05, "grad_norm": 17.09317970275879, "learning_rate": 6.451886392539211e-07, "loss": 0.4979, "mean_token_accuracy": 0.8440105319023132, "num_tokens": 58152911.0, "step": 1523 }, { "epoch": 0.1938684645719374, "ewc_loss": 0.021728515625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.5854835510253906e-05, "grad_norm": 17.317934036254883, "learning_rate": 6.45612547689699e-07, "loss": 0.5162, "mean_token_accuracy": 0.8398026823997498, "num_tokens": 58192095.0, "step": 1524 }, { "epoch": 0.19399567485052793, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.022830963134766, "learning_rate": 6.460364561254769e-07, "loss": 0.4707, "mean_token_accuracy": 0.8527258038520813, "num_tokens": 58232651.0, "step": 1525 }, { "epoch": 0.19412288512911843, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.5974044799804688e-05, "grad_norm": 17.168582916259766, "learning_rate": 6.464603645612547e-07, "loss": 0.4676, "mean_token_accuracy": 0.8535875082015991, "num_tokens": 58270802.0, "step": 1526 }, { "epoch": 0.19425009540770893, "ewc_loss": 0.0220947265625, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.274499893188477, "learning_rate": 6.468842729970327e-07, "loss": 0.5002, "mean_token_accuracy": 0.8465245962142944, "num_tokens": 58313179.0, "step": 1527 }, { "epoch": 0.19437730568629946, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.170541763305664, "learning_rate": 6.473081814328105e-07, "loss": 0.5395, "mean_token_accuracy": 0.8361151814460754, "num_tokens": 58354728.0, "step": 1528 }, { "epoch": 0.19450451596488996, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.537818908691406, "learning_rate": 6.477320898685885e-07, "loss": 0.5247, "mean_token_accuracy": 0.839097261428833, "num_tokens": 58398649.0, "step": 1529 }, { "epoch": 0.19463172624348046, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.5974044799804688e-05, "grad_norm": 17.15472412109375, "learning_rate": 6.481559983043662e-07, "loss": 0.5351, "mean_token_accuracy": 0.8324442505836487, "num_tokens": 58438261.0, "step": 1530 }, { "epoch": 0.194758936522071, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.5974044799804688e-05, "grad_norm": 17.430530548095703, "learning_rate": 6.48579906740144e-07, "loss": 0.4796, "mean_token_accuracy": 0.8491149544715881, "num_tokens": 58479141.0, "step": 1531 }, { "epoch": 0.1948861468006615, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.56015968322754, "learning_rate": 6.49003815175922e-07, "loss": 0.448, "mean_token_accuracy": 0.8603495359420776, "num_tokens": 58515700.0, "step": 1532 }, { "epoch": 0.195013357079252, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.5974044799804688e-05, "grad_norm": 17.294641494750977, "learning_rate": 6.494277236116998e-07, "loss": 0.4538, "mean_token_accuracy": 0.858959436416626, "num_tokens": 58549825.0, "step": 1533 }, { "epoch": 0.19514056735784252, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.325599670410156, "learning_rate": 6.498516320474777e-07, "loss": 0.5938, "mean_token_accuracy": 0.8196769952774048, "num_tokens": 58596110.0, "step": 1534 }, { "epoch": 0.19526777763643302, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.4449405670166, "learning_rate": 6.502755404832556e-07, "loss": 0.5046, "mean_token_accuracy": 0.8387967944145203, "num_tokens": 58632410.0, "step": 1535 }, { "epoch": 0.19539498791502352, "ewc_loss": 0.02197265625, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 16.971471786499023, "learning_rate": 6.506994489190335e-07, "loss": 0.5675, "mean_token_accuracy": 0.8288507461547852, "num_tokens": 58671339.0, "step": 1536 }, { "epoch": 0.19552219819361405, "ewc_loss": 0.0220947265625, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.378177642822266, "learning_rate": 6.511233573548114e-07, "loss": 0.4658, "mean_token_accuracy": 0.8532673120498657, "num_tokens": 58711786.0, "step": 1537 }, { "epoch": 0.19564940847220455, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.153730392456055, "learning_rate": 6.515472657905892e-07, "loss": 0.5093, "mean_token_accuracy": 0.8403714895248413, "num_tokens": 58743600.0, "step": 1538 }, { "epoch": 0.19577661875079505, "ewc_loss": 0.0220947265625, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.431957244873047, "learning_rate": 6.51971174226367e-07, "loss": 0.5676, "mean_token_accuracy": 0.8238270282745361, "num_tokens": 58780622.0, "step": 1539 }, { "epoch": 0.19590382902938558, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.380624771118164, "learning_rate": 6.52395082662145e-07, "loss": 0.5614, "mean_token_accuracy": 0.8254996538162231, "num_tokens": 58819370.0, "step": 1540 }, { "epoch": 0.19603103930797608, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.072914123535156, "learning_rate": 6.528189910979228e-07, "loss": 0.4835, "mean_token_accuracy": 0.8480303883552551, "num_tokens": 58856270.0, "step": 1541 }, { "epoch": 0.1961582495865666, "ewc_loss": 0.0220947265625, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.44898223876953, "learning_rate": 6.532428995337007e-07, "loss": 0.4998, "mean_token_accuracy": 0.8435432314872742, "num_tokens": 58898702.0, "step": 1542 }, { "epoch": 0.1962854598651571, "ewc_loss": 0.0220947265625, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.378314971923828, "learning_rate": 6.536668079694786e-07, "loss": 0.5223, "mean_token_accuracy": 0.8342589735984802, "num_tokens": 58934602.0, "step": 1543 }, { "epoch": 0.1964126701437476, "ewc_loss": 0.0220947265625, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.609325408935547e-05, "grad_norm": 17.30266761779785, "learning_rate": 6.540907164052565e-07, "loss": 0.5069, "mean_token_accuracy": 0.8413147330284119, "num_tokens": 58977766.0, "step": 1544 }, { "epoch": 0.19653988042233814, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.504554748535156, "learning_rate": 6.545146248410343e-07, "loss": 0.5182, "mean_token_accuracy": 0.8378844261169434, "num_tokens": 59019649.0, "step": 1545 }, { "epoch": 0.19666709070092864, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.355228424072266, "learning_rate": 6.549385332768122e-07, "loss": 0.5299, "mean_token_accuracy": 0.8369354605674744, "num_tokens": 59060543.0, "step": 1546 }, { "epoch": 0.19679430097951914, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.415061950683594, "learning_rate": 6.5536244171259e-07, "loss": 0.5281, "mean_token_accuracy": 0.8337849378585815, "num_tokens": 59100049.0, "step": 1547 }, { "epoch": 0.19692151125810967, "ewc_loss": 0.0223388671875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.633167266845703e-05, "grad_norm": 17.49250030517578, "learning_rate": 6.55786350148368e-07, "loss": 0.5362, "mean_token_accuracy": 0.8320220708847046, "num_tokens": 59137370.0, "step": 1548 }, { "epoch": 0.19704872153670017, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.45657730102539, "learning_rate": 6.562102585841458e-07, "loss": 0.4886, "mean_token_accuracy": 0.8437382578849792, "num_tokens": 59176196.0, "step": 1549 }, { "epoch": 0.19717593181529067, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.365358352661133, "learning_rate": 6.566341670199236e-07, "loss": 0.4863, "mean_token_accuracy": 0.8486002087593079, "num_tokens": 59216215.0, "step": 1550 }, { "epoch": 0.1973031420938812, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.3382568359375, "learning_rate": 6.570580754557016e-07, "loss": 0.5032, "mean_token_accuracy": 0.8418949246406555, "num_tokens": 59254173.0, "step": 1551 }, { "epoch": 0.1974303523724717, "ewc_loss": 0.0223388671875, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.633167266845703e-05, "grad_norm": 17.20782470703125, "learning_rate": 6.574819838914794e-07, "loss": 0.4747, "mean_token_accuracy": 0.8510817289352417, "num_tokens": 59288496.0, "step": 1552 }, { "epoch": 0.1975575626510622, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.411672592163086, "learning_rate": 6.579058923272573e-07, "loss": 0.5352, "mean_token_accuracy": 0.8320448398590088, "num_tokens": 59329043.0, "step": 1553 }, { "epoch": 0.19768477292965272, "ewc_loss": 0.0224609375, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.6450881958007812e-05, "grad_norm": 17.546398162841797, "learning_rate": 6.583298007630351e-07, "loss": 0.5258, "mean_token_accuracy": 0.8376563787460327, "num_tokens": 59360743.0, "step": 1554 }, { "epoch": 0.19781198320824323, "ewc_loss": 0.022216796875, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.621246337890625e-05, "grad_norm": 17.295673370361328, "learning_rate": 6.58753709198813e-07, "loss": 0.5208, "mean_token_accuracy": 0.837181031703949, "num_tokens": 59404167.0, "step": 1555 }, { "epoch": 0.19793919348683373, "ewc_loss": 0.0224609375, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.6450881958007812e-05, "grad_norm": 17.35795783996582, "learning_rate": 6.591776176345909e-07, "loss": 0.4883, "mean_token_accuracy": 0.844171404838562, "num_tokens": 59447770.0, "step": 1556 }, { "epoch": 0.19806640376542425, "ewc_loss": 0.0223388671875, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 1.633167266845703e-05, "grad_norm": 17.34510612487793, "learning_rate": 6.596015260703688e-07, "loss": 0.4984, "mean_token_accuracy": 0.8480902910232544, "num_tokens": 59494989.0, "step": 1557 }, { "epoch": 0.19819361404401475, "ewc_loss": 0.0223388671875, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.633167266845703e-05, "grad_norm": 17.509536743164062, "learning_rate": 6.600254345061466e-07, "loss": 0.4853, "mean_token_accuracy": 0.8491241931915283, "num_tokens": 59534926.0, "step": 1558 }, { "epoch": 0.19832082432260525, "ewc_loss": 0.0223388671875, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.633167266845703e-05, "grad_norm": 17.212446212768555, "learning_rate": 6.604493429419246e-07, "loss": 0.5405, "mean_token_accuracy": 0.8327782154083252, "num_tokens": 59577832.0, "step": 1559 }, { "epoch": 0.19844803460119578, "ewc_loss": 0.0225830078125, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.56861686706543, "learning_rate": 6.608732513777023e-07, "loss": 0.5485, "mean_token_accuracy": 0.8375657200813293, "num_tokens": 59624995.0, "step": 1560 }, { "epoch": 0.19857524487978628, "ewc_loss": 0.0224609375, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.6450881958007812e-05, "grad_norm": 17.26915168762207, "learning_rate": 6.612971598134803e-07, "loss": 0.5006, "mean_token_accuracy": 0.8422713279724121, "num_tokens": 59663000.0, "step": 1561 }, { "epoch": 0.19870245515837678, "ewc_loss": 0.0225830078125, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.587108612060547, "learning_rate": 6.617210682492581e-07, "loss": 0.5576, "mean_token_accuracy": 0.8276380300521851, "num_tokens": 59700241.0, "step": 1562 }, { "epoch": 0.1988296654369673, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.65080451965332, "learning_rate": 6.62144976685036e-07, "loss": 0.4738, "mean_token_accuracy": 0.8537251949310303, "num_tokens": 59737244.0, "step": 1563 }, { "epoch": 0.1989568757155578, "ewc_loss": 0.0225830078125, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.35573959350586, "learning_rate": 6.625688851208139e-07, "loss": 0.5238, "mean_token_accuracy": 0.8373908996582031, "num_tokens": 59775538.0, "step": 1564 }, { "epoch": 0.19908408599414834, "ewc_loss": 0.0224609375, "ewc_loss_diag": 6.020069122314453e-06, "ewc_loss_parallel": 1.6450881958007812e-05, "grad_norm": 17.402284622192383, "learning_rate": 6.629927935565918e-07, "loss": 0.5179, "mean_token_accuracy": 0.8363758325576782, "num_tokens": 59810373.0, "step": 1565 }, { "epoch": 0.19921129627273884, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.530588150024414, "learning_rate": 6.634167019923696e-07, "loss": 0.5497, "mean_token_accuracy": 0.8330421447753906, "num_tokens": 59849992.0, "step": 1566 }, { "epoch": 0.19933850655132934, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.509326934814453, "learning_rate": 6.638406104281476e-07, "loss": 0.5105, "mean_token_accuracy": 0.8390858173370361, "num_tokens": 59882552.0, "step": 1567 }, { "epoch": 0.19946571682991987, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.434982299804688, "learning_rate": 6.642645188639253e-07, "loss": 0.4863, "mean_token_accuracy": 0.846566915512085, "num_tokens": 59925090.0, "step": 1568 }, { "epoch": 0.19959292710851037, "ewc_loss": 0.0224609375, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6450881958007812e-05, "grad_norm": 17.297277450561523, "learning_rate": 6.646884272997032e-07, "loss": 0.5357, "mean_token_accuracy": 0.8338562846183777, "num_tokens": 59965406.0, "step": 1569 }, { "epoch": 0.19972013738710087, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.529773712158203, "learning_rate": 6.651123357354811e-07, "loss": 0.5229, "mean_token_accuracy": 0.8374522924423218, "num_tokens": 60001770.0, "step": 1570 }, { "epoch": 0.1998473476656914, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.65192985534668, "learning_rate": 6.655362441712589e-07, "loss": 0.5386, "mean_token_accuracy": 0.8317141532897949, "num_tokens": 60036911.0, "step": 1571 }, { "epoch": 0.1999745579442819, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.350576400756836, "learning_rate": 6.659601526070369e-07, "loss": 0.5657, "mean_token_accuracy": 0.8286788463592529, "num_tokens": 60080210.0, "step": 1572 }, { "epoch": 0.2001017682228724, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.440195083618164, "learning_rate": 6.663840610428147e-07, "loss": 0.602, "mean_token_accuracy": 0.8202509880065918, "num_tokens": 60111533.0, "step": 1573 }, { "epoch": 0.20022897850146293, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.079673767089844e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.622554779052734, "learning_rate": 6.668079694785926e-07, "loss": 0.4752, "mean_token_accuracy": 0.8518082499504089, "num_tokens": 60150733.0, "step": 1574 }, { "epoch": 0.20035618878005343, "ewc_loss": 0.0228271484375, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.38059425354004, "learning_rate": 6.672318779143704e-07, "loss": 0.4959, "mean_token_accuracy": 0.8456517457962036, "num_tokens": 60185034.0, "step": 1575 }, { "epoch": 0.20048339905864393, "ewc_loss": 0.0228271484375, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.66459083557129, "learning_rate": 6.676557863501483e-07, "loss": 0.5284, "mean_token_accuracy": 0.8366471529006958, "num_tokens": 60216865.0, "step": 1576 }, { "epoch": 0.20061060933723446, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.715362548828125, "learning_rate": 6.680796947859262e-07, "loss": 0.518, "mean_token_accuracy": 0.8380476832389832, "num_tokens": 60257685.0, "step": 1577 }, { "epoch": 0.20073781961582496, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.2631778717041, "learning_rate": 6.685036032217041e-07, "loss": 0.5099, "mean_token_accuracy": 0.8419275283813477, "num_tokens": 60294258.0, "step": 1578 }, { "epoch": 0.20086502989441546, "ewc_loss": 0.0228271484375, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.70858383178711, "learning_rate": 6.689275116574819e-07, "loss": 0.5007, "mean_token_accuracy": 0.8476959466934204, "num_tokens": 60330319.0, "step": 1579 }, { "epoch": 0.200992240173006, "ewc_loss": 0.0228271484375, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.41163444519043, "learning_rate": 6.693514200932599e-07, "loss": 0.5182, "mean_token_accuracy": 0.8416764736175537, "num_tokens": 60370482.0, "step": 1580 }, { "epoch": 0.2011194504515965, "ewc_loss": 0.0228271484375, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.244400024414062, "learning_rate": 6.697753285290377e-07, "loss": 0.5238, "mean_token_accuracy": 0.8366527557373047, "num_tokens": 60412436.0, "step": 1581 }, { "epoch": 0.201246660730187, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.87801170349121, "learning_rate": 6.701992369648156e-07, "loss": 0.4462, "mean_token_accuracy": 0.8614977598190308, "num_tokens": 60448982.0, "step": 1582 }, { "epoch": 0.20137387100877752, "ewc_loss": 0.0230712890625, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.601543426513672, "learning_rate": 6.706231454005934e-07, "loss": 0.464, "mean_token_accuracy": 0.8535346984863281, "num_tokens": 60489878.0, "step": 1583 }, { "epoch": 0.20150108128736802, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.993309020996094, "learning_rate": 6.710470538363713e-07, "loss": 0.5, "mean_token_accuracy": 0.8467397689819336, "num_tokens": 60528979.0, "step": 1584 }, { "epoch": 0.20162829156595852, "ewc_loss": 0.022705078125, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6570091247558594e-05, "grad_norm": 17.48626136779785, "learning_rate": 6.714709622721492e-07, "loss": 0.5267, "mean_token_accuracy": 0.8345417976379395, "num_tokens": 60561670.0, "step": 1585 }, { "epoch": 0.20175550184454905, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.568302154541016, "learning_rate": 6.718948707079271e-07, "loss": 0.5026, "mean_token_accuracy": 0.8415582180023193, "num_tokens": 60602061.0, "step": 1586 }, { "epoch": 0.20188271212313955, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.764251708984375, "learning_rate": 6.723187791437049e-07, "loss": 0.5053, "mean_token_accuracy": 0.8419654965400696, "num_tokens": 60639805.0, "step": 1587 }, { "epoch": 0.20200992240173005, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.420412063598633, "learning_rate": 6.727426875794829e-07, "loss": 0.532, "mean_token_accuracy": 0.8355029821395874, "num_tokens": 60681816.0, "step": 1588 }, { "epoch": 0.20213713268032057, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.7689266204834, "learning_rate": 6.731665960152607e-07, "loss": 0.5164, "mean_token_accuracy": 0.8359149098396301, "num_tokens": 60726343.0, "step": 1589 }, { "epoch": 0.20226434295891108, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.55252456665039, "learning_rate": 6.735905044510385e-07, "loss": 0.56, "mean_token_accuracy": 0.8210057020187378, "num_tokens": 60764814.0, "step": 1590 }, { "epoch": 0.2023915532375016, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6689300537109375e-05, "grad_norm": 17.617942810058594, "learning_rate": 6.740144128868164e-07, "loss": 0.5275, "mean_token_accuracy": 0.8365973234176636, "num_tokens": 60802384.0, "step": 1591 }, { "epoch": 0.2025187635160921, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.796466827392578, "learning_rate": 6.744383213225942e-07, "loss": 0.5553, "mean_token_accuracy": 0.826626718044281, "num_tokens": 60842085.0, "step": 1592 }, { "epoch": 0.2026459737946826, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.887561798095703, "learning_rate": 6.748622297583722e-07, "loss": 0.4892, "mean_token_accuracy": 0.8429893255233765, "num_tokens": 60874835.0, "step": 1593 }, { "epoch": 0.20277318407327313, "ewc_loss": 0.0230712890625, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.47504234313965, "learning_rate": 6.7528613819415e-07, "loss": 0.5178, "mean_token_accuracy": 0.837081253528595, "num_tokens": 60910682.0, "step": 1594 }, { "epoch": 0.20290039435186363, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.691713333129883, "learning_rate": 6.757100466299279e-07, "loss": 0.5125, "mean_token_accuracy": 0.8429076671600342, "num_tokens": 60951352.0, "step": 1595 }, { "epoch": 0.20302760463045413, "ewc_loss": 0.02294921875, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 1.6808509826660156e-05, "grad_norm": 17.673999786376953, "learning_rate": 6.761339550657058e-07, "loss": 0.542, "mean_token_accuracy": 0.8301178812980652, "num_tokens": 60979055.0, "step": 1596 }, { "epoch": 0.20315481490904466, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.53961753845215, "learning_rate": 6.765578635014837e-07, "loss": 0.519, "mean_token_accuracy": 0.8413670659065247, "num_tokens": 61017437.0, "step": 1597 }, { "epoch": 0.20328202518763516, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.776931762695312, "learning_rate": 6.769817719372614e-07, "loss": 0.4953, "mean_token_accuracy": 0.8450718522071838, "num_tokens": 61052286.0, "step": 1598 }, { "epoch": 0.20340923546622566, "ewc_loss": 0.0233154296875, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.507192611694336, "learning_rate": 6.774056803730394e-07, "loss": 0.5407, "mean_token_accuracy": 0.8309535384178162, "num_tokens": 61086110.0, "step": 1599 }, { "epoch": 0.2035364457448162, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.466651916503906, "learning_rate": 6.778295888088172e-07, "loss": 0.5119, "mean_token_accuracy": 0.8404154777526855, "num_tokens": 61124181.0, "step": 1600 }, { "epoch": 0.2036636560234067, "ewc_loss": 0.0233154296875, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.61275863647461, "learning_rate": 6.782534972445952e-07, "loss": 0.5329, "mean_token_accuracy": 0.8325222730636597, "num_tokens": 61160881.0, "step": 1601 }, { "epoch": 0.2037908663019972, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.64778709411621, "learning_rate": 6.78677405680373e-07, "loss": 0.5301, "mean_token_accuracy": 0.8373125195503235, "num_tokens": 61200755.0, "step": 1602 }, { "epoch": 0.20391807658058772, "ewc_loss": 0.0233154296875, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.590505599975586, "learning_rate": 6.791013141161509e-07, "loss": 0.5211, "mean_token_accuracy": 0.836107611656189, "num_tokens": 61240045.0, "step": 1603 }, { "epoch": 0.20404528685917822, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.652986526489258, "learning_rate": 6.795252225519288e-07, "loss": 0.5461, "mean_token_accuracy": 0.8287869095802307, "num_tokens": 61282832.0, "step": 1604 }, { "epoch": 0.20417249713776872, "ewc_loss": 0.0233154296875, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.586095809936523, "learning_rate": 6.799491309877067e-07, "loss": 0.5192, "mean_token_accuracy": 0.8372239470481873, "num_tokens": 61319841.0, "step": 1605 }, { "epoch": 0.20429970741635925, "ewc_loss": 0.0234375, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.58050537109375, "learning_rate": 6.803730394234844e-07, "loss": 0.4889, "mean_token_accuracy": 0.8475660085678101, "num_tokens": 61352732.0, "step": 1606 }, { "epoch": 0.20442691769494975, "ewc_loss": 0.0234375, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.827634811401367, "learning_rate": 6.807969478592624e-07, "loss": 0.5219, "mean_token_accuracy": 0.8394609689712524, "num_tokens": 61388324.0, "step": 1607 }, { "epoch": 0.20455412797354025, "ewc_loss": 0.0234375, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.71971321105957, "learning_rate": 6.812208562950402e-07, "loss": 0.5459, "mean_token_accuracy": 0.8364435434341431, "num_tokens": 61431222.0, "step": 1608 }, { "epoch": 0.20468133825213078, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.669649124145508, "learning_rate": 6.816447647308182e-07, "loss": 0.5766, "mean_token_accuracy": 0.8217275142669678, "num_tokens": 61475617.0, "step": 1609 }, { "epoch": 0.20480854853072128, "ewc_loss": 0.023193359375, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 1.6927719116210938e-05, "grad_norm": 17.861173629760742, "learning_rate": 6.82068673166596e-07, "loss": 0.5009, "mean_token_accuracy": 0.845991849899292, "num_tokens": 61511338.0, "step": 1610 }, { "epoch": 0.20493575880931178, "ewc_loss": 0.0234375, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.65964698791504, "learning_rate": 6.824925816023738e-07, "loss": 0.5234, "mean_token_accuracy": 0.8350634574890137, "num_tokens": 61549093.0, "step": 1611 }, { "epoch": 0.2050629690879023, "ewc_loss": 0.0234375, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 1.704692840576172e-05, "grad_norm": 17.619674682617188, "learning_rate": 6.829164900381518e-07, "loss": 0.4905, "mean_token_accuracy": 0.8463406562805176, "num_tokens": 61593843.0, "step": 1612 }, { "epoch": 0.2051901793664928, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.65080451965332, "learning_rate": 6.833403984739295e-07, "loss": 0.503, "mean_token_accuracy": 0.8416097164154053, "num_tokens": 61629362.0, "step": 1613 }, { "epoch": 0.2053173896450833, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.79341697692871, "learning_rate": 6.837643069097074e-07, "loss": 0.5835, "mean_token_accuracy": 0.8189263939857483, "num_tokens": 61667037.0, "step": 1614 }, { "epoch": 0.20544459992367384, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.788026809692383, "learning_rate": 6.841882153454853e-07, "loss": 0.5022, "mean_token_accuracy": 0.8412510752677917, "num_tokens": 61701976.0, "step": 1615 }, { "epoch": 0.20557181020226434, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.756595611572266, "learning_rate": 6.846121237812632e-07, "loss": 0.4706, "mean_token_accuracy": 0.8532472252845764, "num_tokens": 61739068.0, "step": 1616 }, { "epoch": 0.20569902048085487, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.88457489013672, "learning_rate": 6.850360322170411e-07, "loss": 0.4836, "mean_token_accuracy": 0.8498131632804871, "num_tokens": 61781515.0, "step": 1617 }, { "epoch": 0.20582623075944537, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.802419662475586, "learning_rate": 6.85459940652819e-07, "loss": 0.4984, "mean_token_accuracy": 0.8410723805427551, "num_tokens": 61815201.0, "step": 1618 }, { "epoch": 0.20595344103803587, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.94368553161621, "learning_rate": 6.858838490885968e-07, "loss": 0.5565, "mean_token_accuracy": 0.8268841505050659, "num_tokens": 61856639.0, "step": 1619 }, { "epoch": 0.2060806513166264, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.758092880249023, "learning_rate": 6.863077575243748e-07, "loss": 0.5695, "mean_token_accuracy": 0.8270235061645508, "num_tokens": 61894313.0, "step": 1620 }, { "epoch": 0.2062078615952169, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.895246505737305, "learning_rate": 6.867316659601525e-07, "loss": 0.5046, "mean_token_accuracy": 0.8402643799781799, "num_tokens": 61930748.0, "step": 1621 }, { "epoch": 0.2063350718738074, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.7921085357666, "learning_rate": 6.871555743959304e-07, "loss": 0.5184, "mean_token_accuracy": 0.839226484298706, "num_tokens": 61972002.0, "step": 1622 }, { "epoch": 0.20646228215239792, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.843915939331055, "learning_rate": 6.875794828317083e-07, "loss": 0.4944, "mean_token_accuracy": 0.8465379476547241, "num_tokens": 62010141.0, "step": 1623 }, { "epoch": 0.20658949243098843, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 18.21469497680664, "learning_rate": 6.880033912674862e-07, "loss": 0.4834, "mean_token_accuracy": 0.851823091506958, "num_tokens": 62055907.0, "step": 1624 }, { "epoch": 0.20671670270957893, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.804113388061523, "learning_rate": 6.884272997032641e-07, "loss": 0.5904, "mean_token_accuracy": 0.8177916407585144, "num_tokens": 62095885.0, "step": 1625 }, { "epoch": 0.20684391298816945, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 18.147449493408203, "learning_rate": 6.88851208139042e-07, "loss": 0.482, "mean_token_accuracy": 0.8509050011634827, "num_tokens": 62130497.0, "step": 1626 }, { "epoch": 0.20697112326675995, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 18.427654266357422, "learning_rate": 6.892751165748198e-07, "loss": 0.5439, "mean_token_accuracy": 0.8306423425674438, "num_tokens": 62170296.0, "step": 1627 }, { "epoch": 0.20709833354535045, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 17.665794372558594, "learning_rate": 6.896990250105978e-07, "loss": 0.4773, "mean_token_accuracy": 0.8550132513046265, "num_tokens": 62210168.0, "step": 1628 }, { "epoch": 0.20722554382394098, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 18.171157836914062, "learning_rate": 6.901229334463755e-07, "loss": 0.5053, "mean_token_accuracy": 0.8414947986602783, "num_tokens": 62245251.0, "step": 1629 }, { "epoch": 0.20735275410253148, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 18.210554122924805, "learning_rate": 6.905468418821534e-07, "loss": 0.5032, "mean_token_accuracy": 0.8473230600357056, "num_tokens": 62286124.0, "step": 1630 }, { "epoch": 0.20747996438112198, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 18.00027847290039, "learning_rate": 6.909707503179313e-07, "loss": 0.5072, "mean_token_accuracy": 0.8391016125679016, "num_tokens": 62325604.0, "step": 1631 }, { "epoch": 0.2076071746597125, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 18.223114013671875, "learning_rate": 6.913946587537091e-07, "loss": 0.5045, "mean_token_accuracy": 0.8419549465179443, "num_tokens": 62364467.0, "step": 1632 }, { "epoch": 0.207734384938303, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.653366088867188, "learning_rate": 6.918185671894871e-07, "loss": 0.5763, "mean_token_accuracy": 0.8214139938354492, "num_tokens": 62402075.0, "step": 1633 }, { "epoch": 0.2078615952168935, "ewc_loss": 0.0235595703125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.71661376953125e-05, "grad_norm": 18.132354736328125, "learning_rate": 6.922424756252649e-07, "loss": 0.5428, "mean_token_accuracy": 0.8376888036727905, "num_tokens": 62437802.0, "step": 1634 }, { "epoch": 0.20798880549548404, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.851619720458984, "learning_rate": 6.926663840610428e-07, "loss": 0.4619, "mean_token_accuracy": 0.853996992111206, "num_tokens": 62476176.0, "step": 1635 }, { "epoch": 0.20811601577407454, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.762420654296875, "learning_rate": 6.930902924968206e-07, "loss": 0.5073, "mean_token_accuracy": 0.838679313659668, "num_tokens": 62514378.0, "step": 1636 }, { "epoch": 0.20824322605266504, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.98436737060547, "learning_rate": 6.935142009325985e-07, "loss": 0.5298, "mean_token_accuracy": 0.8351736068725586, "num_tokens": 62555302.0, "step": 1637 }, { "epoch": 0.20837043633125557, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.76467514038086, "learning_rate": 6.939381093683764e-07, "loss": 0.5484, "mean_token_accuracy": 0.829598069190979, "num_tokens": 62594356.0, "step": 1638 }, { "epoch": 0.20849764660984607, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.94961929321289, "learning_rate": 6.943620178041543e-07, "loss": 0.4652, "mean_token_accuracy": 0.8554313778877258, "num_tokens": 62632903.0, "step": 1639 }, { "epoch": 0.2086248568884366, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.689767837524414, "learning_rate": 6.947859262399321e-07, "loss": 0.5164, "mean_token_accuracy": 0.8454924821853638, "num_tokens": 62668979.0, "step": 1640 }, { "epoch": 0.2087520671670271, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.84646987915039, "learning_rate": 6.952098346757101e-07, "loss": 0.5018, "mean_token_accuracy": 0.843198299407959, "num_tokens": 62705867.0, "step": 1641 }, { "epoch": 0.2088792774456176, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.7523765563964844e-05, "grad_norm": 17.912351608276367, "learning_rate": 6.956337431114879e-07, "loss": 0.5004, "mean_token_accuracy": 0.8463618159294128, "num_tokens": 62744559.0, "step": 1642 }, { "epoch": 0.20900648772420813, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.670597076416016, "learning_rate": 6.960576515472658e-07, "loss": 0.529, "mean_token_accuracy": 0.8364283442497253, "num_tokens": 62785221.0, "step": 1643 }, { "epoch": 0.20913369800279863, "ewc_loss": 0.023681640625, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 1.728534698486328e-05, "grad_norm": 17.796554565429688, "learning_rate": 6.964815599830436e-07, "loss": 0.5182, "mean_token_accuracy": 0.8374959826469421, "num_tokens": 62820095.0, "step": 1644 }, { "epoch": 0.20926090828138913, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7404556274414062e-05, "grad_norm": 17.746427536010742, "learning_rate": 6.969054684188215e-07, "loss": 0.5705, "mean_token_accuracy": 0.819879949092865, "num_tokens": 62863414.0, "step": 1645 }, { "epoch": 0.20938811855997966, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7523765563964844e-05, "grad_norm": 17.849302291870117, "learning_rate": 6.973293768545994e-07, "loss": 0.4798, "mean_token_accuracy": 0.8504326343536377, "num_tokens": 62904647.0, "step": 1646 }, { "epoch": 0.20951532883857016, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7523765563964844e-05, "grad_norm": 17.785125732421875, "learning_rate": 6.977532852903773e-07, "loss": 0.5134, "mean_token_accuracy": 0.8386474847793579, "num_tokens": 62949476.0, "step": 1647 }, { "epoch": 0.20964253911716066, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7642974853515625e-05, "grad_norm": 18.08363151550293, "learning_rate": 6.981771937261551e-07, "loss": 0.5685, "mean_token_accuracy": 0.8239320516586304, "num_tokens": 62984784.0, "step": 1648 }, { "epoch": 0.2097697493957512, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7523765563964844e-05, "grad_norm": 17.843725204467773, "learning_rate": 6.986011021619331e-07, "loss": 0.5198, "mean_token_accuracy": 0.8390644788742065, "num_tokens": 63024395.0, "step": 1649 }, { "epoch": 0.2098969596743417, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 18.294164657592773, "learning_rate": 6.990250105977109e-07, "loss": 0.5376, "mean_token_accuracy": 0.8356049060821533, "num_tokens": 63061886.0, "step": 1650 }, { "epoch": 0.2100241699529322, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7523765563964844e-05, "grad_norm": 17.87934112548828, "learning_rate": 6.994489190334886e-07, "loss": 0.5058, "mean_token_accuracy": 0.8409086465835571, "num_tokens": 63104150.0, "step": 1651 }, { "epoch": 0.21015138023152272, "ewc_loss": 0.02392578125, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7404556274414062e-05, "grad_norm": 17.992740631103516, "learning_rate": 6.998728274692666e-07, "loss": 0.4667, "mean_token_accuracy": 0.8543250560760498, "num_tokens": 63144346.0, "step": 1652 }, { "epoch": 0.21027859051011322, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 17.929702758789062, "learning_rate": 7.002967359050444e-07, "loss": 0.5106, "mean_token_accuracy": 0.8467124104499817, "num_tokens": 63180042.0, "step": 1653 }, { "epoch": 0.21040580078870372, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7762184143066406e-05, "grad_norm": 18.069866180419922, "learning_rate": 7.007206443408224e-07, "loss": 0.4596, "mean_token_accuracy": 0.8543233275413513, "num_tokens": 63216570.0, "step": 1654 }, { "epoch": 0.21053301106729425, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 17.899049758911133, "learning_rate": 7.011445527766002e-07, "loss": 0.5112, "mean_token_accuracy": 0.841333270072937, "num_tokens": 63253457.0, "step": 1655 }, { "epoch": 0.21066022134588475, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7762184143066406e-05, "grad_norm": 17.98563575744629, "learning_rate": 7.015684612123781e-07, "loss": 0.4971, "mean_token_accuracy": 0.8433884382247925, "num_tokens": 63295430.0, "step": 1656 }, { "epoch": 0.21078743162447525, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 17.980859756469727, "learning_rate": 7.01992369648156e-07, "loss": 0.5179, "mean_token_accuracy": 0.8408814668655396, "num_tokens": 63340118.0, "step": 1657 }, { "epoch": 0.21091464190306577, "ewc_loss": 0.024169921875, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 17.984933853149414, "learning_rate": 7.024162780839339e-07, "loss": 0.55, "mean_token_accuracy": 0.8224808573722839, "num_tokens": 63372037.0, "step": 1658 }, { "epoch": 0.21104185218165628, "ewc_loss": 0.0242919921875, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 18.24896240234375, "learning_rate": 7.028401865197116e-07, "loss": 0.5407, "mean_token_accuracy": 0.8297984600067139, "num_tokens": 63415508.0, "step": 1659 }, { "epoch": 0.21116906246024678, "ewc_loss": 0.0244140625, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 1.800060272216797e-05, "grad_norm": 17.952247619628906, "learning_rate": 7.032640949554896e-07, "loss": 0.4476, "mean_token_accuracy": 0.8564967513084412, "num_tokens": 63459320.0, "step": 1660 }, { "epoch": 0.2112962727388373, "ewc_loss": 0.0244140625, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 1.800060272216797e-05, "grad_norm": 18.1656551361084, "learning_rate": 7.036880033912674e-07, "loss": 0.5493, "mean_token_accuracy": 0.8311240673065186, "num_tokens": 63499624.0, "step": 1661 }, { "epoch": 0.2114234830174278, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.16473960876465, "learning_rate": 7.041119118270454e-07, "loss": 0.5081, "mean_token_accuracy": 0.8405512571334839, "num_tokens": 63536507.0, "step": 1662 }, { "epoch": 0.2115506932960183, "ewc_loss": 0.0244140625, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.800060272216797e-05, "grad_norm": 18.470006942749023, "learning_rate": 7.045358202628232e-07, "loss": 0.5476, "mean_token_accuracy": 0.8325159549713135, "num_tokens": 63576483.0, "step": 1663 }, { "epoch": 0.21167790357460883, "ewc_loss": 0.0242919921875, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.7881393432617188e-05, "grad_norm": 18.23747444152832, "learning_rate": 7.049597286986011e-07, "loss": 0.5369, "mean_token_accuracy": 0.8319711089134216, "num_tokens": 63615568.0, "step": 1664 }, { "epoch": 0.21180511385319933, "ewc_loss": 0.0244140625, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.800060272216797e-05, "grad_norm": 18.2872371673584, "learning_rate": 7.05383637134379e-07, "loss": 0.4979, "mean_token_accuracy": 0.8447448015213013, "num_tokens": 63650023.0, "step": 1665 }, { "epoch": 0.21193232413178986, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.33150291442871, "learning_rate": 7.058075455701568e-07, "loss": 0.5271, "mean_token_accuracy": 0.8313829898834229, "num_tokens": 63690229.0, "step": 1666 }, { "epoch": 0.21205953441038036, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.105558395385742, "learning_rate": 7.062314540059346e-07, "loss": 0.47, "mean_token_accuracy": 0.8556421995162964, "num_tokens": 63728169.0, "step": 1667 }, { "epoch": 0.21218674468897086, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.465347290039062, "learning_rate": 7.066553624417126e-07, "loss": 0.4968, "mean_token_accuracy": 0.8436208963394165, "num_tokens": 63762816.0, "step": 1668 }, { "epoch": 0.2123139549675614, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.5267086029052734e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.29098129272461, "learning_rate": 7.070792708774904e-07, "loss": 0.5188, "mean_token_accuracy": 0.8387647867202759, "num_tokens": 63804288.0, "step": 1669 }, { "epoch": 0.2124411652461519, "ewc_loss": 0.0244140625, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.800060272216797e-05, "grad_norm": 17.956562042236328, "learning_rate": 7.075031793132684e-07, "loss": 0.5387, "mean_token_accuracy": 0.8362329602241516, "num_tokens": 63851365.0, "step": 1670 }, { "epoch": 0.2125683755247424, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.176111221313477, "learning_rate": 7.079270877490462e-07, "loss": 0.5009, "mean_token_accuracy": 0.8468120098114014, "num_tokens": 63886187.0, "step": 1671 }, { "epoch": 0.21269558580333292, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.264080047607422, "learning_rate": 7.08350996184824e-07, "loss": 0.5743, "mean_token_accuracy": 0.8234132528305054, "num_tokens": 63927187.0, "step": 1672 }, { "epoch": 0.21282279608192342, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.138063430786133, "learning_rate": 7.08774904620602e-07, "loss": 0.4945, "mean_token_accuracy": 0.8493094444274902, "num_tokens": 63964088.0, "step": 1673 }, { "epoch": 0.21295000636051392, "ewc_loss": 0.0245361328125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.811981201171875e-05, "grad_norm": 18.088844299316406, "learning_rate": 7.091988130563797e-07, "loss": 0.5405, "mean_token_accuracy": 0.8327972888946533, "num_tokens": 64004931.0, "step": 1674 }, { "epoch": 0.21307721663910445, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.307876586914062, "learning_rate": 7.096227214921576e-07, "loss": 0.5339, "mean_token_accuracy": 0.8345423340797424, "num_tokens": 64043966.0, "step": 1675 }, { "epoch": 0.21320442691769495, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.23567008972168, "learning_rate": 7.100466299279355e-07, "loss": 0.5157, "mean_token_accuracy": 0.8403607606887817, "num_tokens": 64078886.0, "step": 1676 }, { "epoch": 0.21333163719628545, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.151004791259766, "learning_rate": 7.104705383637134e-07, "loss": 0.5404, "mean_token_accuracy": 0.8382078409194946, "num_tokens": 64117291.0, "step": 1677 }, { "epoch": 0.21345884747487598, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.240270614624023, "learning_rate": 7.108944467994913e-07, "loss": 0.4954, "mean_token_accuracy": 0.8459447622299194, "num_tokens": 64156369.0, "step": 1678 }, { "epoch": 0.21358605775346648, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.18103790283203, "learning_rate": 7.113183552352692e-07, "loss": 0.5698, "mean_token_accuracy": 0.8194520473480225, "num_tokens": 64193646.0, "step": 1679 }, { "epoch": 0.21371326803205698, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.214048385620117, "learning_rate": 7.11742263671047e-07, "loss": 0.4609, "mean_token_accuracy": 0.8563570976257324, "num_tokens": 64231933.0, "step": 1680 }, { "epoch": 0.2138404783106475, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.276649475097656, "learning_rate": 7.12166172106825e-07, "loss": 0.5953, "mean_token_accuracy": 0.8134602904319763, "num_tokens": 64267418.0, "step": 1681 }, { "epoch": 0.213967688589238, "ewc_loss": 0.0247802734375, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.01909637451172, "learning_rate": 7.125900805426027e-07, "loss": 0.4746, "mean_token_accuracy": 0.8514499068260193, "num_tokens": 64302440.0, "step": 1682 }, { "epoch": 0.2140948988678285, "ewc_loss": 0.024658203125, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.823902130126953e-05, "grad_norm": 18.187416076660156, "learning_rate": 7.130139889783806e-07, "loss": 0.515, "mean_token_accuracy": 0.8401548862457275, "num_tokens": 64338240.0, "step": 1683 }, { "epoch": 0.21422210914641904, "ewc_loss": 0.0247802734375, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.24813461303711, "learning_rate": 7.134378974141585e-07, "loss": 0.5054, "mean_token_accuracy": 0.8412832021713257, "num_tokens": 64375734.0, "step": 1684 }, { "epoch": 0.21434931942500954, "ewc_loss": 0.0247802734375, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.12596893310547, "learning_rate": 7.138618058499364e-07, "loss": 0.4991, "mean_token_accuracy": 0.8450014591217041, "num_tokens": 64411401.0, "step": 1685 }, { "epoch": 0.21447652970360004, "ewc_loss": 0.0247802734375, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.293052673339844, "learning_rate": 7.142857142857143e-07, "loss": 0.4746, "mean_token_accuracy": 0.8530802726745605, "num_tokens": 64454719.0, "step": 1686 }, { "epoch": 0.21460373998219057, "ewc_loss": 0.0247802734375, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.1737117767334, "learning_rate": 7.147096227214922e-07, "loss": 0.4993, "mean_token_accuracy": 0.8457895517349243, "num_tokens": 64495736.0, "step": 1687 }, { "epoch": 0.21473095026078107, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.51700210571289, "learning_rate": 7.1513353115727e-07, "loss": 0.48, "mean_token_accuracy": 0.849611759185791, "num_tokens": 64532843.0, "step": 1688 }, { "epoch": 0.21485816053937157, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.199914932250977, "learning_rate": 7.155574395930479e-07, "loss": 0.527, "mean_token_accuracy": 0.8318731784820557, "num_tokens": 64568950.0, "step": 1689 }, { "epoch": 0.2149853708179621, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.484699249267578, "learning_rate": 7.159813480288257e-07, "loss": 0.5023, "mean_token_accuracy": 0.840679407119751, "num_tokens": 64606878.0, "step": 1690 }, { "epoch": 0.2151125810965526, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.171932220458984, "learning_rate": 7.164052564646035e-07, "loss": 0.468, "mean_token_accuracy": 0.8535171747207642, "num_tokens": 64643012.0, "step": 1691 }, { "epoch": 0.21523979137514312, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.490625381469727, "learning_rate": 7.168291649003815e-07, "loss": 0.4908, "mean_token_accuracy": 0.8508827090263367, "num_tokens": 64684444.0, "step": 1692 }, { "epoch": 0.21536700165373363, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.333740234375, "learning_rate": 7.172530733361593e-07, "loss": 0.542, "mean_token_accuracy": 0.8325089812278748, "num_tokens": 64722068.0, "step": 1693 }, { "epoch": 0.21549421193232413, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.51679229736328, "learning_rate": 7.176769817719373e-07, "loss": 0.4529, "mean_token_accuracy": 0.8644272685050964, "num_tokens": 64761829.0, "step": 1694 }, { "epoch": 0.21562142221091465, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.263944625854492, "learning_rate": 7.181008902077151e-07, "loss": 0.498, "mean_token_accuracy": 0.8458395004272461, "num_tokens": 64799185.0, "step": 1695 }, { "epoch": 0.21574863248950515, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.573362350463867, "learning_rate": 7.18524798643493e-07, "loss": 0.5294, "mean_token_accuracy": 0.8374912738800049, "num_tokens": 64839306.0, "step": 1696 }, { "epoch": 0.21587584276809565, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.26462173461914, "learning_rate": 7.189487070792708e-07, "loss": 0.5028, "mean_token_accuracy": 0.845585823059082, "num_tokens": 64876011.0, "step": 1697 }, { "epoch": 0.21600305304668618, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.42517852783203, "learning_rate": 7.193726155150487e-07, "loss": 0.503, "mean_token_accuracy": 0.8462929725646973, "num_tokens": 64918510.0, "step": 1698 }, { "epoch": 0.21613026332527668, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.324674606323242, "learning_rate": 7.197965239508265e-07, "loss": 0.5096, "mean_token_accuracy": 0.8425095081329346, "num_tokens": 64957156.0, "step": 1699 }, { "epoch": 0.21625747360386718, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.8414249420166, "learning_rate": 7.202204323866045e-07, "loss": 0.5085, "mean_token_accuracy": 0.844141960144043, "num_tokens": 64998761.0, "step": 1700 }, { "epoch": 0.2163846838824577, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.194658279418945, "learning_rate": 7.206443408223823e-07, "loss": 0.5121, "mean_token_accuracy": 0.8393474817276001, "num_tokens": 65038849.0, "step": 1701 }, { "epoch": 0.2165118941610482, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 19.475618362426758, "learning_rate": 7.210682492581603e-07, "loss": 0.4916, "mean_token_accuracy": 0.84409099817276, "num_tokens": 65071130.0, "step": 1702 }, { "epoch": 0.2166391044396387, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.480587005615234, "learning_rate": 7.214921576939381e-07, "loss": 0.5341, "mean_token_accuracy": 0.8314419984817505, "num_tokens": 65111619.0, "step": 1703 }, { "epoch": 0.21676631471822924, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.336156845092773, "learning_rate": 7.219160661297159e-07, "loss": 0.4805, "mean_token_accuracy": 0.8496876955032349, "num_tokens": 65147537.0, "step": 1704 }, { "epoch": 0.21689352499681974, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 19.355758666992188, "learning_rate": 7.223399745654938e-07, "loss": 0.5165, "mean_token_accuracy": 0.8385074138641357, "num_tokens": 65187254.0, "step": 1705 }, { "epoch": 0.21702073527541024, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.86862564086914, "learning_rate": 7.227638830012717e-07, "loss": 0.5278, "mean_token_accuracy": 0.8332049250602722, "num_tokens": 65226130.0, "step": 1706 }, { "epoch": 0.21714794555400077, "ewc_loss": 0.02490234375, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 1.8358230590820312e-05, "grad_norm": 18.33083152770996, "learning_rate": 7.231877914370495e-07, "loss": 0.4917, "mean_token_accuracy": 0.8494221568107605, "num_tokens": 65261282.0, "step": 1707 }, { "epoch": 0.21727515583259127, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.68860626220703, "learning_rate": 7.236116998728275e-07, "loss": 0.5426, "mean_token_accuracy": 0.830136239528656, "num_tokens": 65294850.0, "step": 1708 }, { "epoch": 0.21740236611118177, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.865161895751953, "learning_rate": 7.240356083086053e-07, "loss": 0.4339, "mean_token_accuracy": 0.8633524179458618, "num_tokens": 65332323.0, "step": 1709 }, { "epoch": 0.2175295763897723, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.253097534179688, "learning_rate": 7.244595167443833e-07, "loss": 0.5294, "mean_token_accuracy": 0.8365375399589539, "num_tokens": 65376033.0, "step": 1710 }, { "epoch": 0.2176567866683628, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.305282592773438, "learning_rate": 7.248834251801611e-07, "loss": 0.4933, "mean_token_accuracy": 0.8481945991516113, "num_tokens": 65417875.0, "step": 1711 }, { "epoch": 0.2177839969469533, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.53284454345703, "learning_rate": 7.253073336159388e-07, "loss": 0.4697, "mean_token_accuracy": 0.8529980778694153, "num_tokens": 65456675.0, "step": 1712 }, { "epoch": 0.21791120722554383, "ewc_loss": 0.025390625, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.777931213378906, "learning_rate": 7.257312420517168e-07, "loss": 0.4628, "mean_token_accuracy": 0.8542598485946655, "num_tokens": 65487699.0, "step": 1713 }, { "epoch": 0.21803841750413433, "ewc_loss": 0.0250244140625, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8477439880371094e-05, "grad_norm": 18.481929779052734, "learning_rate": 7.261551504874946e-07, "loss": 0.4894, "mean_token_accuracy": 0.8446367383003235, "num_tokens": 65526143.0, "step": 1714 }, { "epoch": 0.21816562778272486, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.327816009521484, "learning_rate": 7.265790589232725e-07, "loss": 0.4872, "mean_token_accuracy": 0.8490958213806152, "num_tokens": 65562969.0, "step": 1715 }, { "epoch": 0.21829283806131536, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.73475456237793, "learning_rate": 7.270029673590504e-07, "loss": 0.5194, "mean_token_accuracy": 0.8408808708190918, "num_tokens": 65605999.0, "step": 1716 }, { "epoch": 0.21842004833990586, "ewc_loss": 0.025390625, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.361421585083008, "learning_rate": 7.274268757948283e-07, "loss": 0.5649, "mean_token_accuracy": 0.8265394568443298, "num_tokens": 65644877.0, "step": 1717 }, { "epoch": 0.2185472586184964, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.69501304626465, "learning_rate": 7.278507842306062e-07, "loss": 0.4846, "mean_token_accuracy": 0.8506261706352234, "num_tokens": 65678823.0, "step": 1718 }, { "epoch": 0.2186744688970869, "ewc_loss": 0.025390625, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.8715858459472656e-05, "grad_norm": 18.505205154418945, "learning_rate": 7.282746926663841e-07, "loss": 0.5654, "mean_token_accuracy": 0.8272676467895508, "num_tokens": 65717185.0, "step": 1719 }, { "epoch": 0.2188016791756774, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.453073501586914, "learning_rate": 7.286986011021618e-07, "loss": 0.4857, "mean_token_accuracy": 0.8495041728019714, "num_tokens": 65754422.0, "step": 1720 }, { "epoch": 0.21892888945426792, "ewc_loss": 0.025390625, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.49742889404297, "learning_rate": 7.291225095379398e-07, "loss": 0.5221, "mean_token_accuracy": 0.8394379615783691, "num_tokens": 65799390.0, "step": 1721 }, { "epoch": 0.21905609973285842, "ewc_loss": 0.025146484375, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.8596649169921875e-05, "grad_norm": 18.456710815429688, "learning_rate": 7.295464179737176e-07, "loss": 0.5148, "mean_token_accuracy": 0.8435982465744019, "num_tokens": 65837566.0, "step": 1722 }, { "epoch": 0.21918331001144892, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.504623413085938, "learning_rate": 7.299703264094955e-07, "loss": 0.5004, "mean_token_accuracy": 0.845443069934845, "num_tokens": 65874537.0, "step": 1723 }, { "epoch": 0.21931052029003945, "ewc_loss": 0.0255126953125, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.542987823486328, "learning_rate": 7.303942348452734e-07, "loss": 0.5268, "mean_token_accuracy": 0.8368774652481079, "num_tokens": 65915616.0, "step": 1724 }, { "epoch": 0.21943773056862995, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.44667625427246, "learning_rate": 7.308181432810513e-07, "loss": 0.4747, "mean_token_accuracy": 0.8502564430236816, "num_tokens": 65954263.0, "step": 1725 }, { "epoch": 0.21956494084722045, "ewc_loss": 0.0255126953125, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.339021682739258, "learning_rate": 7.312420517168292e-07, "loss": 0.5071, "mean_token_accuracy": 0.8401779532432556, "num_tokens": 65992880.0, "step": 1726 }, { "epoch": 0.21969215112581097, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.488431930541992, "learning_rate": 7.31665960152607e-07, "loss": 0.5256, "mean_token_accuracy": 0.8328714370727539, "num_tokens": 66033958.0, "step": 1727 }, { "epoch": 0.21981936140440148, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.645917892456055e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.422536849975586, "learning_rate": 7.320898685883848e-07, "loss": 0.4964, "mean_token_accuracy": 0.8446683883666992, "num_tokens": 66072448.0, "step": 1728 }, { "epoch": 0.21994657168299198, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.609582901000977, "learning_rate": 7.325137770241628e-07, "loss": 0.6078, "mean_token_accuracy": 0.8140748739242554, "num_tokens": 66112648.0, "step": 1729 }, { "epoch": 0.2200737819615825, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.48052978515625, "learning_rate": 7.329376854599406e-07, "loss": 0.5195, "mean_token_accuracy": 0.8396480083465576, "num_tokens": 66154791.0, "step": 1730 }, { "epoch": 0.220200992240173, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.63621711730957, "learning_rate": 7.333615938957184e-07, "loss": 0.5094, "mean_token_accuracy": 0.8410258293151855, "num_tokens": 66192450.0, "step": 1731 }, { "epoch": 0.2203282025187635, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.705522537231445e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.782901763916016, "learning_rate": 7.337855023314964e-07, "loss": 0.5022, "mean_token_accuracy": 0.8422406911849976, "num_tokens": 66231173.0, "step": 1732 }, { "epoch": 0.22045541279735403, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.705522537231445e-06, "ewc_loss_parallel": 1.919269561767578e-05, "grad_norm": 18.971572875976562, "learning_rate": 7.342094107672742e-07, "loss": 0.5555, "mean_token_accuracy": 0.8267025947570801, "num_tokens": 66272956.0, "step": 1733 }, { "epoch": 0.22058262307594453, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.705522537231445e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.62592887878418, "learning_rate": 7.346333192030522e-07, "loss": 0.5093, "mean_token_accuracy": 0.8464849591255188, "num_tokens": 66309450.0, "step": 1734 }, { "epoch": 0.22070983335453503, "ewc_loss": 0.0255126953125, "ewc_loss_diag": 6.705522537231445e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.65760612487793, "learning_rate": 7.350572276388299e-07, "loss": 0.5598, "mean_token_accuracy": 0.8241496086120605, "num_tokens": 66350947.0, "step": 1735 }, { "epoch": 0.22083704363312556, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.705522537231445e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 19.236928939819336, "learning_rate": 7.354811360746078e-07, "loss": 0.5173, "mean_token_accuracy": 0.8367159366607666, "num_tokens": 66388364.0, "step": 1736 }, { "epoch": 0.22096425391171606, "ewc_loss": 0.0255126953125, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.21286964416504, "learning_rate": 7.359050445103857e-07, "loss": 0.5315, "mean_token_accuracy": 0.8339261412620544, "num_tokens": 66425029.0, "step": 1737 }, { "epoch": 0.22109146419030656, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 19.294694900512695, "learning_rate": 7.363289529461636e-07, "loss": 0.5071, "mean_token_accuracy": 0.8431786298751831, "num_tokens": 66463924.0, "step": 1738 }, { "epoch": 0.2212186744688971, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.919269561767578e-05, "grad_norm": 19.031591415405273, "learning_rate": 7.367528613819415e-07, "loss": 0.4623, "mean_token_accuracy": 0.855601966381073, "num_tokens": 66504824.0, "step": 1739 }, { "epoch": 0.2213458847474876, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.790205001831055, "learning_rate": 7.371767698177194e-07, "loss": 0.4807, "mean_token_accuracy": 0.8500166535377502, "num_tokens": 66539308.0, "step": 1740 }, { "epoch": 0.22147309502607812, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.86525535583496, "learning_rate": 7.376006782534972e-07, "loss": 0.4772, "mean_token_accuracy": 0.8516169190406799, "num_tokens": 66572785.0, "step": 1741 }, { "epoch": 0.22160030530466862, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.7081298828125, "learning_rate": 7.380245866892751e-07, "loss": 0.4945, "mean_token_accuracy": 0.8452885150909424, "num_tokens": 66607890.0, "step": 1742 }, { "epoch": 0.22172751558325912, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 19.001853942871094, "learning_rate": 7.384484951250529e-07, "loss": 0.4973, "mean_token_accuracy": 0.8475654125213623, "num_tokens": 66650553.0, "step": 1743 }, { "epoch": 0.22185472586184965, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.7701416015625, "learning_rate": 7.388724035608308e-07, "loss": 0.4899, "mean_token_accuracy": 0.8491432070732117, "num_tokens": 66685330.0, "step": 1744 }, { "epoch": 0.22198193614044015, "ewc_loss": 0.0255126953125, "ewc_loss_diag": 6.705522537231445e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.837329864501953, "learning_rate": 7.392963119966087e-07, "loss": 0.4739, "mean_token_accuracy": 0.8513482809066772, "num_tokens": 66716816.0, "step": 1745 }, { "epoch": 0.22210914641903065, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.956026077270508, "learning_rate": 7.397202204323866e-07, "loss": 0.5191, "mean_token_accuracy": 0.8396563529968262, "num_tokens": 66754030.0, "step": 1746 }, { "epoch": 0.22223635669762118, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.907421112060547, "learning_rate": 7.401441288681645e-07, "loss": 0.4757, "mean_token_accuracy": 0.8521996736526489, "num_tokens": 66793650.0, "step": 1747 }, { "epoch": 0.22236356697621168, "ewc_loss": 0.0255126953125, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.8835067749023438e-05, "grad_norm": 18.75836753845215, "learning_rate": 7.405680373039424e-07, "loss": 0.4601, "mean_token_accuracy": 0.8598619699478149, "num_tokens": 66833709.0, "step": 1748 }, { "epoch": 0.22249077725480218, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.919269561767578e-05, "grad_norm": 18.832977294921875, "learning_rate": 7.409919457397202e-07, "loss": 0.53, "mean_token_accuracy": 0.834469199180603, "num_tokens": 66866456.0, "step": 1749 }, { "epoch": 0.2226179875333927, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.919448852539062, "learning_rate": 7.414158541754981e-07, "loss": 0.5552, "mean_token_accuracy": 0.8266646862030029, "num_tokens": 66905875.0, "step": 1750 }, { "epoch": 0.2227451978119832, "ewc_loss": 0.0257568359375, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.828876495361328, "learning_rate": 7.418397626112759e-07, "loss": 0.5088, "mean_token_accuracy": 0.8502441644668579, "num_tokens": 66936009.0, "step": 1751 }, { "epoch": 0.2228724080905737, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 1.919269561767578e-05, "grad_norm": 18.746530532836914, "learning_rate": 7.422636710470537e-07, "loss": 0.5149, "mean_token_accuracy": 0.8410117030143738, "num_tokens": 66978111.0, "step": 1752 }, { "epoch": 0.22299961836916424, "ewc_loss": 0.025634765625, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 1.895427703857422e-05, "grad_norm": 18.930795669555664, "learning_rate": 7.426875794828317e-07, "loss": 0.5278, "mean_token_accuracy": 0.8401840925216675, "num_tokens": 67015938.0, "step": 1753 }, { "epoch": 0.22312682864775474, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.57474708557129, "learning_rate": 7.431114879186095e-07, "loss": 0.5474, "mean_token_accuracy": 0.8311930298805237, "num_tokens": 67055456.0, "step": 1754 }, { "epoch": 0.22325403892634524, "ewc_loss": 0.0260009765625, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.919269561767578e-05, "grad_norm": 19.063302993774414, "learning_rate": 7.435353963543875e-07, "loss": 0.5042, "mean_token_accuracy": 0.8436055183410645, "num_tokens": 67096298.0, "step": 1755 }, { "epoch": 0.22338124920493577, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 18.556884765625, "learning_rate": 7.439593047901653e-07, "loss": 0.4591, "mean_token_accuracy": 0.8562396764755249, "num_tokens": 67135280.0, "step": 1756 }, { "epoch": 0.22350845948352627, "ewc_loss": 0.02587890625, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9073486328125e-05, "grad_norm": 19.02792739868164, "learning_rate": 7.443832132259431e-07, "loss": 0.5246, "mean_token_accuracy": 0.8356043100357056, "num_tokens": 67171936.0, "step": 1757 }, { "epoch": 0.22363566976211677, "ewc_loss": 0.0262451171875, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9431114196777344e-05, "grad_norm": 18.633621215820312, "learning_rate": 7.44807121661721e-07, "loss": 0.5139, "mean_token_accuracy": 0.8404344320297241, "num_tokens": 67209854.0, "step": 1758 }, { "epoch": 0.2237628800407073, "ewc_loss": 0.0263671875, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9550323486328125e-05, "grad_norm": 18.89686393737793, "learning_rate": 7.452310300974989e-07, "loss": 0.4682, "mean_token_accuracy": 0.8504295945167542, "num_tokens": 67244369.0, "step": 1759 }, { "epoch": 0.2238900903192978, "ewc_loss": 0.026123046875, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9311904907226562e-05, "grad_norm": 18.91385841369629, "learning_rate": 7.456549385332767e-07, "loss": 0.5471, "mean_token_accuracy": 0.8313510417938232, "num_tokens": 67283994.0, "step": 1760 }, { "epoch": 0.2240173005978883, "ewc_loss": 0.0262451171875, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9431114196777344e-05, "grad_norm": 18.580568313598633, "learning_rate": 7.460788469690547e-07, "loss": 0.431, "mean_token_accuracy": 0.8635919094085693, "num_tokens": 67323230.0, "step": 1761 }, { "epoch": 0.22414451087647883, "ewc_loss": 0.0263671875, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9550323486328125e-05, "grad_norm": 18.938865661621094, "learning_rate": 7.465027554048325e-07, "loss": 0.5873, "mean_token_accuracy": 0.8214453458786011, "num_tokens": 67366198.0, "step": 1762 }, { "epoch": 0.22427172115506933, "ewc_loss": 0.0262451171875, "ewc_loss_diag": 6.854534149169922e-06, "ewc_loss_parallel": 1.9431114196777344e-05, "grad_norm": 18.386798858642578, "learning_rate": 7.469266638406105e-07, "loss": 0.57, "mean_token_accuracy": 0.826991617679596, "num_tokens": 67410685.0, "step": 1763 }, { "epoch": 0.22439893143365983, "ewc_loss": 0.0264892578125, "ewc_loss_diag": 6.854534149169922e-06, "ewc_loss_parallel": 1.9669532775878906e-05, "grad_norm": 18.87735939025879, "learning_rate": 7.473505722763883e-07, "loss": 0.4918, "mean_token_accuracy": 0.8470340967178345, "num_tokens": 67445818.0, "step": 1764 }, { "epoch": 0.22452614171225035, "ewc_loss": 0.0263671875, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 1.9550323486328125e-05, "grad_norm": 18.81222152709961, "learning_rate": 7.477744807121661e-07, "loss": 0.5347, "mean_token_accuracy": 0.8350646495819092, "num_tokens": 67483873.0, "step": 1765 }, { "epoch": 0.22465335199084085, "ewc_loss": 0.0264892578125, "ewc_loss_diag": 6.854534149169922e-06, "ewc_loss_parallel": 1.9669532775878906e-05, "grad_norm": 19.179485321044922, "learning_rate": 7.48198389147944e-07, "loss": 0.5131, "mean_token_accuracy": 0.8449755907058716, "num_tokens": 67519524.0, "step": 1766 }, { "epoch": 0.22478056226943138, "ewc_loss": 0.0264892578125, "ewc_loss_diag": 6.854534149169922e-06, "ewc_loss_parallel": 1.9669532775878906e-05, "grad_norm": 18.817670822143555, "learning_rate": 7.486222975837219e-07, "loss": 0.4975, "mean_token_accuracy": 0.8435407876968384, "num_tokens": 67560920.0, "step": 1767 }, { "epoch": 0.22490777254802188, "ewc_loss": 0.0263671875, "ewc_loss_diag": 6.854534149169922e-06, "ewc_loss_parallel": 1.9550323486328125e-05, "grad_norm": 18.837533950805664, "learning_rate": 7.490462060194997e-07, "loss": 0.4666, "mean_token_accuracy": 0.8510546684265137, "num_tokens": 67594948.0, "step": 1768 }, { "epoch": 0.22503498282661238, "ewc_loss": 0.026611328125, "ewc_loss_diag": 6.884336471557617e-06, "ewc_loss_parallel": 1.9788742065429688e-05, "grad_norm": 18.839380264282227, "learning_rate": 7.494701144552777e-07, "loss": 0.558, "mean_token_accuracy": 0.8271111845970154, "num_tokens": 67628867.0, "step": 1769 }, { "epoch": 0.2251621931052029, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.884336471557617e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 19.222389221191406, "learning_rate": 7.498940228910555e-07, "loss": 0.517, "mean_token_accuracy": 0.837658166885376, "num_tokens": 67669762.0, "step": 1770 }, { "epoch": 0.2252894033837934, "ewc_loss": 0.0263671875, "ewc_loss_diag": 6.884336471557617e-06, "ewc_loss_parallel": 1.9550323486328125e-05, "grad_norm": 18.940902709960938, "learning_rate": 7.503179313268335e-07, "loss": 0.5053, "mean_token_accuracy": 0.8401466608047485, "num_tokens": 67704750.0, "step": 1771 }, { "epoch": 0.2254166136623839, "ewc_loss": 0.026611328125, "ewc_loss_diag": 6.884336471557617e-06, "ewc_loss_parallel": 1.9669532775878906e-05, "grad_norm": 18.790348052978516, "learning_rate": 7.507418397626113e-07, "loss": 0.5429, "mean_token_accuracy": 0.8317030072212219, "num_tokens": 67745181.0, "step": 1772 }, { "epoch": 0.22554382394097444, "ewc_loss": 0.026611328125, "ewc_loss_diag": 6.884336471557617e-06, "ewc_loss_parallel": 1.9669532775878906e-05, "grad_norm": 18.97437858581543, "learning_rate": 7.51165748198389e-07, "loss": 0.5292, "mean_token_accuracy": 0.8344156742095947, "num_tokens": 67785929.0, "step": 1773 }, { "epoch": 0.22567103421956494, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.884336471557617e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 18.9537296295166, "learning_rate": 7.51589656634167e-07, "loss": 0.4835, "mean_token_accuracy": 0.845319390296936, "num_tokens": 67821887.0, "step": 1774 }, { "epoch": 0.22579824449815544, "ewc_loss": 0.0264892578125, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 1.9550323486328125e-05, "grad_norm": 18.954204559326172, "learning_rate": 7.520135650699448e-07, "loss": 0.5179, "mean_token_accuracy": 0.8388082981109619, "num_tokens": 67859608.0, "step": 1775 }, { "epoch": 0.22592545477674597, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 18.836654663085938, "learning_rate": 7.524374735057227e-07, "loss": 0.5281, "mean_token_accuracy": 0.8378293514251709, "num_tokens": 67895287.0, "step": 1776 }, { "epoch": 0.22605266505533647, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.048389434814453, "learning_rate": 7.528613819415006e-07, "loss": 0.4848, "mean_token_accuracy": 0.8529200553894043, "num_tokens": 67930358.0, "step": 1777 }, { "epoch": 0.22617987533392697, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 18.886720657348633, "learning_rate": 7.532852903772785e-07, "loss": 0.527, "mean_token_accuracy": 0.8351439833641052, "num_tokens": 67966112.0, "step": 1778 }, { "epoch": 0.2263070856125175, "ewc_loss": 0.026611328125, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 1.9669532775878906e-05, "grad_norm": 18.83580207824707, "learning_rate": 7.537091988130564e-07, "loss": 0.5351, "mean_token_accuracy": 0.8319675922393799, "num_tokens": 68001297.0, "step": 1779 }, { "epoch": 0.226434295891108, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.30942726135254, "learning_rate": 7.541331072488342e-07, "loss": 0.516, "mean_token_accuracy": 0.8393172025680542, "num_tokens": 68037346.0, "step": 1780 }, { "epoch": 0.2265615061696985, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 18.80364990234375, "learning_rate": 7.54557015684612e-07, "loss": 0.5135, "mean_token_accuracy": 0.8407607078552246, "num_tokens": 68076566.0, "step": 1781 }, { "epoch": 0.22668871644828903, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 19.220550537109375, "learning_rate": 7.5498092412039e-07, "loss": 0.4913, "mean_token_accuracy": 0.8499994874000549, "num_tokens": 68116834.0, "step": 1782 }, { "epoch": 0.22681592672687953, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 19.221044540405273, "learning_rate": 7.554048325561678e-07, "loss": 0.4675, "mean_token_accuracy": 0.8554897904396057, "num_tokens": 68155573.0, "step": 1783 }, { "epoch": 0.22694313700547003, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.02465057373047, "learning_rate": 7.558287409919457e-07, "loss": 0.5619, "mean_token_accuracy": 0.8285416960716248, "num_tokens": 68193378.0, "step": 1784 }, { "epoch": 0.22707034728406056, "ewc_loss": 0.0267333984375, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.9788742065429688e-05, "grad_norm": 18.964223861694336, "learning_rate": 7.562526494277236e-07, "loss": 0.5326, "mean_token_accuracy": 0.8320423364639282, "num_tokens": 68230183.0, "step": 1785 }, { "epoch": 0.22719755756265106, "ewc_loss": 0.0267333984375, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.9788742065429688e-05, "grad_norm": 18.97960662841797, "learning_rate": 7.566765578635015e-07, "loss": 0.5028, "mean_token_accuracy": 0.8449503779411316, "num_tokens": 68270526.0, "step": 1786 }, { "epoch": 0.22732476784124156, "ewc_loss": 0.02685546875, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 19.06097412109375, "learning_rate": 7.571004662992794e-07, "loss": 0.5272, "mean_token_accuracy": 0.8397976160049438, "num_tokens": 68308690.0, "step": 1787 }, { "epoch": 0.2274519781198321, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 18.975433349609375, "learning_rate": 7.575243747350572e-07, "loss": 0.5706, "mean_token_accuracy": 0.8254879713058472, "num_tokens": 68348660.0, "step": 1788 }, { "epoch": 0.2275791883984226, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.013944625854492, "learning_rate": 7.57948283170835e-07, "loss": 0.5191, "mean_token_accuracy": 0.842127799987793, "num_tokens": 68389129.0, "step": 1789 }, { "epoch": 0.2277063986770131, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 18.86858367919922, "learning_rate": 7.58372191606613e-07, "loss": 0.5102, "mean_token_accuracy": 0.8435119390487671, "num_tokens": 68423234.0, "step": 1790 }, { "epoch": 0.22783360895560362, "ewc_loss": 0.02685546875, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 1.990795135498047e-05, "grad_norm": 19.009689331054688, "learning_rate": 7.587961000423908e-07, "loss": 0.5177, "mean_token_accuracy": 0.8407986164093018, "num_tokens": 68460834.0, "step": 1791 }, { "epoch": 0.22796081923419412, "ewc_loss": 0.027099609375, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 19.14109992980957, "learning_rate": 7.592200084781686e-07, "loss": 0.5622, "mean_token_accuracy": 0.8300977945327759, "num_tokens": 68495605.0, "step": 1792 }, { "epoch": 0.22808802951278465, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 18.82659339904785, "learning_rate": 7.596439169139466e-07, "loss": 0.508, "mean_token_accuracy": 0.8426755666732788, "num_tokens": 68530838.0, "step": 1793 }, { "epoch": 0.22821523979137515, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.103429794311523, "learning_rate": 7.600678253497244e-07, "loss": 0.5021, "mean_token_accuracy": 0.8465830087661743, "num_tokens": 68571225.0, "step": 1794 }, { "epoch": 0.22834245006996565, "ewc_loss": 0.027099609375, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 19.289724349975586, "learning_rate": 7.604917337855023e-07, "loss": 0.4848, "mean_token_accuracy": 0.8525725603103638, "num_tokens": 68608284.0, "step": 1795 }, { "epoch": 0.22846966034855618, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.06070327758789, "learning_rate": 7.609156422212801e-07, "loss": 0.4655, "mean_token_accuracy": 0.8558120727539062, "num_tokens": 68645943.0, "step": 1796 }, { "epoch": 0.22859687062714668, "ewc_loss": 0.027099609375, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 18.97104835510254, "learning_rate": 7.61339550657058e-07, "loss": 0.489, "mean_token_accuracy": 0.8470144271850586, "num_tokens": 68684245.0, "step": 1797 }, { "epoch": 0.22872408090573718, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.27619171142578, "learning_rate": 7.617634590928359e-07, "loss": 0.503, "mean_token_accuracy": 0.8476213216781616, "num_tokens": 68717532.0, "step": 1798 }, { "epoch": 0.2288512911843277, "ewc_loss": 0.02734375, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 2.0384788513183594e-05, "grad_norm": 18.885290145874023, "learning_rate": 7.621873675286138e-07, "loss": 0.5214, "mean_token_accuracy": 0.8405027389526367, "num_tokens": 68759515.0, "step": 1799 }, { "epoch": 0.2289785014629182, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.559406280517578, "learning_rate": 7.626112759643916e-07, "loss": 0.5288, "mean_token_accuracy": 0.8360630869865417, "num_tokens": 68796969.0, "step": 1800 }, { "epoch": 0.2291057117415087, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.432390213012695, "learning_rate": 7.630351844001696e-07, "loss": 0.5101, "mean_token_accuracy": 0.8402526378631592, "num_tokens": 68831138.0, "step": 1801 }, { "epoch": 0.22923292202009923, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.028606414794922, "learning_rate": 7.634590928359474e-07, "loss": 0.5015, "mean_token_accuracy": 0.8464771509170532, "num_tokens": 68868741.0, "step": 1802 }, { "epoch": 0.22936013229868973, "ewc_loss": 0.0269775390625, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.217260360717773, "learning_rate": 7.638830012717253e-07, "loss": 0.5188, "mean_token_accuracy": 0.8371703028678894, "num_tokens": 68909191.0, "step": 1803 }, { "epoch": 0.22948734257728023, "ewc_loss": 0.027099609375, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 19.553314208984375, "learning_rate": 7.643069097075031e-07, "loss": 0.4945, "mean_token_accuracy": 0.8462690114974976, "num_tokens": 68949261.0, "step": 1804 }, { "epoch": 0.22961455285587076, "ewc_loss": 0.027099609375, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 18.958240509033203, "learning_rate": 7.64730818143281e-07, "loss": 0.4693, "mean_token_accuracy": 0.8500484228134155, "num_tokens": 68990434.0, "step": 1805 }, { "epoch": 0.22974176313446126, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.554452896118164, "learning_rate": 7.651547265790589e-07, "loss": 0.5529, "mean_token_accuracy": 0.8302739858627319, "num_tokens": 69029482.0, "step": 1806 }, { "epoch": 0.22986897341305176, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.570661544799805, "learning_rate": 7.655786350148368e-07, "loss": 0.5326, "mean_token_accuracy": 0.8350025415420532, "num_tokens": 69069766.0, "step": 1807 }, { "epoch": 0.2299961836916423, "ewc_loss": 0.0272216796875, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 18.937753677368164, "learning_rate": 7.660025434506146e-07, "loss": 0.5184, "mean_token_accuracy": 0.8398247957229614, "num_tokens": 69114592.0, "step": 1808 }, { "epoch": 0.2301233939702328, "ewc_loss": 0.027099609375, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 19.4443416595459, "learning_rate": 7.664264518863926e-07, "loss": 0.507, "mean_token_accuracy": 0.8435121774673462, "num_tokens": 69152599.0, "step": 1809 }, { "epoch": 0.2302506042488233, "ewc_loss": 0.027099609375, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 2.014636993408203e-05, "grad_norm": 19.049537658691406, "learning_rate": 7.668503603221704e-07, "loss": 0.5597, "mean_token_accuracy": 0.823691725730896, "num_tokens": 69193157.0, "step": 1810 }, { "epoch": 0.23037781452741382, "ewc_loss": 0.02734375, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.34761619567871, "learning_rate": 7.672742687579483e-07, "loss": 0.4727, "mean_token_accuracy": 0.8512747883796692, "num_tokens": 69230681.0, "step": 1811 }, { "epoch": 0.23050502480600432, "ewc_loss": 0.027099609375, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.002716064453125e-05, "grad_norm": 19.33704376220703, "learning_rate": 7.676981771937261e-07, "loss": 0.4486, "mean_token_accuracy": 0.8562577366828918, "num_tokens": 69261292.0, "step": 1812 }, { "epoch": 0.23063223508459482, "ewc_loss": 0.0274658203125, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0384788513183594e-05, "grad_norm": 19.168317794799805, "learning_rate": 7.681220856295039e-07, "loss": 0.4816, "mean_token_accuracy": 0.8507214784622192, "num_tokens": 69301795.0, "step": 1813 }, { "epoch": 0.23075944536318535, "ewc_loss": 0.02734375, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0265579223632812e-05, "grad_norm": 19.212623596191406, "learning_rate": 7.685459940652819e-07, "loss": 0.5042, "mean_token_accuracy": 0.8412865400314331, "num_tokens": 69337038.0, "step": 1814 }, { "epoch": 0.23088665564177585, "ewc_loss": 0.0274658203125, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0384788513183594e-05, "grad_norm": 19.434494018554688, "learning_rate": 7.689699025010597e-07, "loss": 0.5026, "mean_token_accuracy": 0.8426297903060913, "num_tokens": 69379871.0, "step": 1815 }, { "epoch": 0.23101386592036638, "ewc_loss": 0.027587890625, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0503997802734375e-05, "grad_norm": 19.37432098388672, "learning_rate": 7.693938109368376e-07, "loss": 0.524, "mean_token_accuracy": 0.8357272148132324, "num_tokens": 69418021.0, "step": 1816 }, { "epoch": 0.23114107619895688, "ewc_loss": 0.0274658203125, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0384788513183594e-05, "grad_norm": 19.080995559692383, "learning_rate": 7.698177193726155e-07, "loss": 0.4998, "mean_token_accuracy": 0.8435379266738892, "num_tokens": 69455574.0, "step": 1817 }, { "epoch": 0.23126828647754738, "ewc_loss": 0.0277099609375, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0623207092285156e-05, "grad_norm": 19.598491668701172, "learning_rate": 7.702416278083933e-07, "loss": 0.5256, "mean_token_accuracy": 0.8374428153038025, "num_tokens": 69489822.0, "step": 1818 }, { "epoch": 0.2313954967561379, "ewc_loss": 0.027587890625, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 2.0503997802734375e-05, "grad_norm": 18.947250366210938, "learning_rate": 7.706655362441712e-07, "loss": 0.4967, "mean_token_accuracy": 0.8446359634399414, "num_tokens": 69532821.0, "step": 1819 }, { "epoch": 0.2315227070347284, "ewc_loss": 0.027587890625, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.0503997802734375e-05, "grad_norm": 19.280271530151367, "learning_rate": 7.710894446799491e-07, "loss": 0.4795, "mean_token_accuracy": 0.8482221364974976, "num_tokens": 69565674.0, "step": 1820 }, { "epoch": 0.2316499173133189, "ewc_loss": 0.0279541015625, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.086162567138672e-05, "grad_norm": 19.26032257080078, "learning_rate": 7.715133531157269e-07, "loss": 0.4789, "mean_token_accuracy": 0.8536186218261719, "num_tokens": 69606118.0, "step": 1821 }, { "epoch": 0.23177712759190944, "ewc_loss": 0.0279541015625, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.086162567138672e-05, "grad_norm": 19.315576553344727, "learning_rate": 7.719372615515049e-07, "loss": 0.5525, "mean_token_accuracy": 0.8308431506156921, "num_tokens": 69650102.0, "step": 1822 }, { "epoch": 0.23190433787049994, "ewc_loss": 0.0279541015625, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.086162567138672e-05, "grad_norm": 19.36249351501465, "learning_rate": 7.723611699872827e-07, "loss": 0.4978, "mean_token_accuracy": 0.844011127948761, "num_tokens": 69693440.0, "step": 1823 }, { "epoch": 0.23203154814909044, "ewc_loss": 0.0279541015625, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.086162567138672e-05, "grad_norm": 19.358808517456055, "learning_rate": 7.727850784230606e-07, "loss": 0.5162, "mean_token_accuracy": 0.8363584280014038, "num_tokens": 69732651.0, "step": 1824 }, { "epoch": 0.23215875842768097, "ewc_loss": 0.0277099609375, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.0623207092285156e-05, "grad_norm": 19.22662925720215, "learning_rate": 7.732089868588385e-07, "loss": 0.5292, "mean_token_accuracy": 0.8395233750343323, "num_tokens": 69769915.0, "step": 1825 }, { "epoch": 0.23228596870627147, "ewc_loss": 0.02783203125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.0742416381835938e-05, "grad_norm": 19.327239990234375, "learning_rate": 7.736328952946163e-07, "loss": 0.4515, "mean_token_accuracy": 0.8601293563842773, "num_tokens": 69813943.0, "step": 1826 }, { "epoch": 0.23241317898486197, "ewc_loss": 0.028076171875, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.09808349609375e-05, "grad_norm": 19.14375877380371, "learning_rate": 7.740568037303942e-07, "loss": 0.4825, "mean_token_accuracy": 0.8475561141967773, "num_tokens": 69847039.0, "step": 1827 }, { "epoch": 0.2325403892634525, "ewc_loss": 0.0281982421875, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.110004425048828e-05, "grad_norm": 19.449405670166016, "learning_rate": 7.744807121661721e-07, "loss": 0.4485, "mean_token_accuracy": 0.8619186282157898, "num_tokens": 69884588.0, "step": 1828 }, { "epoch": 0.232667599542043, "ewc_loss": 0.028076171875, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.09808349609375e-05, "grad_norm": 19.255905151367188, "learning_rate": 7.749046206019499e-07, "loss": 0.5125, "mean_token_accuracy": 0.8414310216903687, "num_tokens": 69925552.0, "step": 1829 }, { "epoch": 0.2327948098206335, "ewc_loss": 0.0283203125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1219253540039062e-05, "grad_norm": 19.474828720092773, "learning_rate": 7.753285290377279e-07, "loss": 0.4449, "mean_token_accuracy": 0.8597263097763062, "num_tokens": 69961776.0, "step": 1830 }, { "epoch": 0.23292202009922403, "ewc_loss": 0.028076171875, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.09808349609375e-05, "grad_norm": 19.28544807434082, "learning_rate": 7.757524374735057e-07, "loss": 0.5505, "mean_token_accuracy": 0.8294004797935486, "num_tokens": 69999775.0, "step": 1831 }, { "epoch": 0.23304923037781453, "ewc_loss": 0.028076171875, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.09808349609375e-05, "grad_norm": 19.522615432739258, "learning_rate": 7.761763459092836e-07, "loss": 0.4622, "mean_token_accuracy": 0.8583171963691711, "num_tokens": 70037294.0, "step": 1832 }, { "epoch": 0.23317644065640503, "ewc_loss": 0.0279541015625, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.086162567138672e-05, "grad_norm": 19.248502731323242, "learning_rate": 7.766002543450614e-07, "loss": 0.4989, "mean_token_accuracy": 0.8454045653343201, "num_tokens": 70075014.0, "step": 1833 }, { "epoch": 0.23330365093499555, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.347240447998047, "learning_rate": 7.770241627808392e-07, "loss": 0.454, "mean_token_accuracy": 0.8590739965438843, "num_tokens": 70112186.0, "step": 1834 }, { "epoch": 0.23343086121358606, "ewc_loss": 0.0283203125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1219253540039062e-05, "grad_norm": 19.462997436523438, "learning_rate": 7.774480712166172e-07, "loss": 0.5341, "mean_token_accuracy": 0.8345950841903687, "num_tokens": 70153285.0, "step": 1835 }, { "epoch": 0.23355807149217656, "ewc_loss": 0.0283203125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1219253540039062e-05, "grad_norm": 19.357521057128906, "learning_rate": 7.77871979652395e-07, "loss": 0.5159, "mean_token_accuracy": 0.8407384157180786, "num_tokens": 70193382.0, "step": 1836 }, { "epoch": 0.23368528177076708, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.48033905029297, "learning_rate": 7.782958880881729e-07, "loss": 0.5058, "mean_token_accuracy": 0.8431004881858826, "num_tokens": 70227279.0, "step": 1837 }, { "epoch": 0.23381249204935758, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.268007278442383, "learning_rate": 7.787197965239508e-07, "loss": 0.4662, "mean_token_accuracy": 0.8524416089057922, "num_tokens": 70264395.0, "step": 1838 }, { "epoch": 0.23393970232794808, "ewc_loss": 0.0281982421875, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.110004425048828e-05, "grad_norm": 19.405611038208008, "learning_rate": 7.791437049597287e-07, "loss": 0.5841, "mean_token_accuracy": 0.8206921815872192, "num_tokens": 70295016.0, "step": 1839 }, { "epoch": 0.2340669126065386, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.29908561706543, "learning_rate": 7.795676133955065e-07, "loss": 0.5282, "mean_token_accuracy": 0.8365146517753601, "num_tokens": 70332774.0, "step": 1840 }, { "epoch": 0.2341941228851291, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.342599868774414, "learning_rate": 7.799915218312844e-07, "loss": 0.5112, "mean_token_accuracy": 0.8408956527709961, "num_tokens": 70366880.0, "step": 1841 }, { "epoch": 0.23432133316371964, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.495393753051758, "learning_rate": 7.804154302670622e-07, "loss": 0.4868, "mean_token_accuracy": 0.8489843606948853, "num_tokens": 70405235.0, "step": 1842 }, { "epoch": 0.23444854344231014, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.240869522094727, "learning_rate": 7.808393387028402e-07, "loss": 0.4974, "mean_token_accuracy": 0.8449556827545166, "num_tokens": 70446503.0, "step": 1843 }, { "epoch": 0.23457575372090064, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.546463012695312, "learning_rate": 7.81263247138618e-07, "loss": 0.5483, "mean_token_accuracy": 0.8296930193901062, "num_tokens": 70483514.0, "step": 1844 }, { "epoch": 0.23470296399949117, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.394153594970703, "learning_rate": 7.816871555743959e-07, "loss": 0.5092, "mean_token_accuracy": 0.841485321521759, "num_tokens": 70520944.0, "step": 1845 }, { "epoch": 0.23483017427808167, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.60542869567871, "learning_rate": 7.821110640101738e-07, "loss": 0.5316, "mean_token_accuracy": 0.8337016105651855, "num_tokens": 70564399.0, "step": 1846 }, { "epoch": 0.23495738455667217, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.55291175842285, "learning_rate": 7.825349724459517e-07, "loss": 0.505, "mean_token_accuracy": 0.8440912365913391, "num_tokens": 70604000.0, "step": 1847 }, { "epoch": 0.2350845948352627, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.46872329711914, "learning_rate": 7.829588808817294e-07, "loss": 0.4973, "mean_token_accuracy": 0.8452077507972717, "num_tokens": 70645133.0, "step": 1848 }, { "epoch": 0.2352118051138532, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.53035545349121, "learning_rate": 7.833827893175074e-07, "loss": 0.5325, "mean_token_accuracy": 0.8303597569465637, "num_tokens": 70675824.0, "step": 1849 }, { "epoch": 0.2353390153924437, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.459293365478516, "learning_rate": 7.838066977532852e-07, "loss": 0.4949, "mean_token_accuracy": 0.8467244505882263, "num_tokens": 70714103.0, "step": 1850 }, { "epoch": 0.23546622567103423, "ewc_loss": 0.0283203125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1219253540039062e-05, "grad_norm": 19.302886962890625, "learning_rate": 7.842306061890632e-07, "loss": 0.546, "mean_token_accuracy": 0.8328005075454712, "num_tokens": 70754265.0, "step": 1851 }, { "epoch": 0.23559343594962473, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.747671127319336, "learning_rate": 7.84654514624841e-07, "loss": 0.5108, "mean_token_accuracy": 0.8393158316612244, "num_tokens": 70791368.0, "step": 1852 }, { "epoch": 0.23572064622821523, "ewc_loss": 0.0283203125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1219253540039062e-05, "grad_norm": 19.292448043823242, "learning_rate": 7.850784230606188e-07, "loss": 0.5578, "mean_token_accuracy": 0.8305296897888184, "num_tokens": 70827397.0, "step": 1853 }, { "epoch": 0.23584785650680576, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.65032386779785, "learning_rate": 7.855023314963968e-07, "loss": 0.5285, "mean_token_accuracy": 0.8398022055625916, "num_tokens": 70872503.0, "step": 1854 }, { "epoch": 0.23597506678539626, "ewc_loss": 0.0286865234375, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.30231475830078, "learning_rate": 7.859262399321746e-07, "loss": 0.5269, "mean_token_accuracy": 0.8354923725128174, "num_tokens": 70914376.0, "step": 1855 }, { "epoch": 0.23610227706398676, "ewc_loss": 0.0284423828125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.873844146728516, "learning_rate": 7.863501483679524e-07, "loss": 0.4831, "mean_token_accuracy": 0.8495466709136963, "num_tokens": 70954004.0, "step": 1856 }, { "epoch": 0.2362294873425773, "ewc_loss": 0.0286865234375, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.495975494384766, "learning_rate": 7.867740568037303e-07, "loss": 0.4845, "mean_token_accuracy": 0.8460550904273987, "num_tokens": 70991765.0, "step": 1857 }, { "epoch": 0.2363566976211678, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1338462829589844e-05, "grad_norm": 19.642314910888672, "learning_rate": 7.871979652395082e-07, "loss": 0.5139, "mean_token_accuracy": 0.836925208568573, "num_tokens": 71025478.0, "step": 1858 }, { "epoch": 0.2364839078997583, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.76227378845215, "learning_rate": 7.876218736752861e-07, "loss": 0.4684, "mean_token_accuracy": 0.8562345504760742, "num_tokens": 71067287.0, "step": 1859 }, { "epoch": 0.23661111817834882, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.4331111907959, "learning_rate": 7.88045782111064e-07, "loss": 0.5388, "mean_token_accuracy": 0.8343977928161621, "num_tokens": 71108095.0, "step": 1860 }, { "epoch": 0.23673832845693932, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.71929931640625, "learning_rate": 7.884696905468418e-07, "loss": 0.471, "mean_token_accuracy": 0.8552627563476562, "num_tokens": 71150297.0, "step": 1861 }, { "epoch": 0.23686553873552982, "ewc_loss": 0.0283203125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1219253540039062e-05, "grad_norm": 19.410476684570312, "learning_rate": 7.888935989826198e-07, "loss": 0.5181, "mean_token_accuracy": 0.8363149166107178, "num_tokens": 71189205.0, "step": 1862 }, { "epoch": 0.23699274901412035, "ewc_loss": 0.02880859375, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1696090698242188e-05, "grad_norm": 19.58627700805664, "learning_rate": 7.893175074183976e-07, "loss": 0.5088, "mean_token_accuracy": 0.8412683010101318, "num_tokens": 71228141.0, "step": 1863 }, { "epoch": 0.23711995929271085, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.5924129486084, "learning_rate": 7.897414158541754e-07, "loss": 0.5093, "mean_token_accuracy": 0.8429687023162842, "num_tokens": 71267854.0, "step": 1864 }, { "epoch": 0.23724716957130135, "ewc_loss": 0.028564453125, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.77492904663086, "learning_rate": 7.901653242899533e-07, "loss": 0.5444, "mean_token_accuracy": 0.8355633020401001, "num_tokens": 71305030.0, "step": 1865 }, { "epoch": 0.23737437984989188, "ewc_loss": 0.0286865234375, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.500150680541992, "learning_rate": 7.905892327257312e-07, "loss": 0.5, "mean_token_accuracy": 0.8457883596420288, "num_tokens": 71342202.0, "step": 1866 }, { "epoch": 0.23750159012848238, "ewc_loss": 0.0286865234375, "ewc_loss_diag": 7.12275505065918e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.716882705688477, "learning_rate": 7.910131411615091e-07, "loss": 0.4991, "mean_token_accuracy": 0.8473154902458191, "num_tokens": 71379702.0, "step": 1867 }, { "epoch": 0.2376288004070729, "ewc_loss": 0.02880859375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.662498474121094, "learning_rate": 7.91437049597287e-07, "loss": 0.5026, "mean_token_accuracy": 0.8441023826599121, "num_tokens": 71417785.0, "step": 1868 }, { "epoch": 0.2377560106856634, "ewc_loss": 0.0286865234375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.504526138305664, "learning_rate": 7.918609580330648e-07, "loss": 0.5456, "mean_token_accuracy": 0.8315529823303223, "num_tokens": 71461358.0, "step": 1869 }, { "epoch": 0.2378832209642539, "ewc_loss": 0.0286865234375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1457672119140625e-05, "grad_norm": 19.547916412353516, "learning_rate": 7.922848664688428e-07, "loss": 0.5428, "mean_token_accuracy": 0.8350273966789246, "num_tokens": 71500059.0, "step": 1870 }, { "epoch": 0.23801043124284443, "ewc_loss": 0.02880859375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.572589874267578, "learning_rate": 7.927087749046205e-07, "loss": 0.453, "mean_token_accuracy": 0.8602997064590454, "num_tokens": 71537043.0, "step": 1871 }, { "epoch": 0.23813764152143493, "ewc_loss": 0.029052734375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.181529998779297e-05, "grad_norm": 19.60521697998047, "learning_rate": 7.931326833403983e-07, "loss": 0.4883, "mean_token_accuracy": 0.8463281393051147, "num_tokens": 71570671.0, "step": 1872 }, { "epoch": 0.23826485180002543, "ewc_loss": 0.0289306640625, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1696090698242188e-05, "grad_norm": 19.856555938720703, "learning_rate": 7.935565917761763e-07, "loss": 0.4996, "mean_token_accuracy": 0.8461012840270996, "num_tokens": 71614813.0, "step": 1873 }, { "epoch": 0.23839206207861596, "ewc_loss": 0.0289306640625, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1696090698242188e-05, "grad_norm": 19.7613468170166, "learning_rate": 7.939805002119541e-07, "loss": 0.5146, "mean_token_accuracy": 0.8380506038665771, "num_tokens": 71650441.0, "step": 1874 }, { "epoch": 0.23851927235720646, "ewc_loss": 0.02880859375, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1696090698242188e-05, "grad_norm": 19.605316162109375, "learning_rate": 7.944044086477321e-07, "loss": 0.5197, "mean_token_accuracy": 0.8401070833206177, "num_tokens": 71690425.0, "step": 1875 }, { "epoch": 0.23864648263579696, "ewc_loss": 0.02880859375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.804624557495117, "learning_rate": 7.948283170835099e-07, "loss": 0.4682, "mean_token_accuracy": 0.8541742563247681, "num_tokens": 71727349.0, "step": 1876 }, { "epoch": 0.2387736929143875, "ewc_loss": 0.029052734375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.181529998779297e-05, "grad_norm": 19.754343032836914, "learning_rate": 7.952522255192878e-07, "loss": 0.4324, "mean_token_accuracy": 0.8660562038421631, "num_tokens": 71770386.0, "step": 1877 }, { "epoch": 0.238900903192978, "ewc_loss": 0.02880859375, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.1576881408691406e-05, "grad_norm": 19.676218032836914, "learning_rate": 7.956761339550657e-07, "loss": 0.4668, "mean_token_accuracy": 0.8552314043045044, "num_tokens": 71808946.0, "step": 1878 }, { "epoch": 0.2390281134715685, "ewc_loss": 0.029052734375, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 2.181529998779297e-05, "grad_norm": 19.623538970947266, "learning_rate": 7.961000423908435e-07, "loss": 0.5066, "mean_token_accuracy": 0.8494614958763123, "num_tokens": 71848605.0, "step": 1879 }, { "epoch": 0.23915532375015902, "ewc_loss": 0.029052734375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.181529998779297e-05, "grad_norm": 19.83510971069336, "learning_rate": 7.965239508266214e-07, "loss": 0.486, "mean_token_accuracy": 0.8477246761322021, "num_tokens": 71886771.0, "step": 1880 }, { "epoch": 0.23928253402874952, "ewc_loss": 0.0291748046875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.193450927734375e-05, "grad_norm": 19.698583602905273, "learning_rate": 7.969478592623993e-07, "loss": 0.4627, "mean_token_accuracy": 0.8543174862861633, "num_tokens": 71924905.0, "step": 1881 }, { "epoch": 0.23940974430734002, "ewc_loss": 0.0291748046875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.193450927734375e-05, "grad_norm": 20.147472381591797, "learning_rate": 7.973717676981771e-07, "loss": 0.4883, "mean_token_accuracy": 0.8505087494850159, "num_tokens": 71964513.0, "step": 1882 }, { "epoch": 0.23953695458593055, "ewc_loss": 0.0289306640625, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.1696090698242188e-05, "grad_norm": 19.54518699645996, "learning_rate": 7.977956761339551e-07, "loss": 0.4581, "mean_token_accuracy": 0.8572336435317993, "num_tokens": 72000918.0, "step": 1883 }, { "epoch": 0.23966416486452105, "ewc_loss": 0.0294189453125, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.2172927856445312e-05, "grad_norm": 20.053050994873047, "learning_rate": 7.982195845697329e-07, "loss": 0.59, "mean_token_accuracy": 0.818673312664032, "num_tokens": 72040840.0, "step": 1884 }, { "epoch": 0.23979137514311155, "ewc_loss": 0.0291748046875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.193450927734375e-05, "grad_norm": 19.551549911499023, "learning_rate": 7.986434930055108e-07, "loss": 0.4613, "mean_token_accuracy": 0.8551055788993835, "num_tokens": 72076757.0, "step": 1885 }, { "epoch": 0.23991858542170208, "ewc_loss": 0.029296875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.205371856689453e-05, "grad_norm": 20.411550521850586, "learning_rate": 7.990674014412886e-07, "loss": 0.52, "mean_token_accuracy": 0.8402745723724365, "num_tokens": 72120760.0, "step": 1886 }, { "epoch": 0.24004579570029258, "ewc_loss": 0.0294189453125, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.2172927856445312e-05, "grad_norm": 19.71510887145996, "learning_rate": 7.994913098770665e-07, "loss": 0.5062, "mean_token_accuracy": 0.8411701321601868, "num_tokens": 72159985.0, "step": 1887 }, { "epoch": 0.24017300597888308, "ewc_loss": 0.029296875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.205371856689453e-05, "grad_norm": 20.22582244873047, "learning_rate": 7.999152183128444e-07, "loss": 0.5401, "mean_token_accuracy": 0.8320555686950684, "num_tokens": 72196377.0, "step": 1888 }, { "epoch": 0.2403002162574736, "ewc_loss": 0.029296875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.205371856689453e-05, "grad_norm": 20.1474609375, "learning_rate": 8.003391267486223e-07, "loss": 0.4889, "mean_token_accuracy": 0.8482121825218201, "num_tokens": 72236585.0, "step": 1889 }, { "epoch": 0.2404274265360641, "ewc_loss": 0.0291748046875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.193450927734375e-05, "grad_norm": 19.77404022216797, "learning_rate": 8.007630351844001e-07, "loss": 0.5027, "mean_token_accuracy": 0.8443648815155029, "num_tokens": 72268450.0, "step": 1890 }, { "epoch": 0.24055463681465464, "ewc_loss": 0.029296875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.205371856689453e-05, "grad_norm": 20.074932098388672, "learning_rate": 8.011869436201781e-07, "loss": 0.4846, "mean_token_accuracy": 0.8486583232879639, "num_tokens": 72304092.0, "step": 1891 }, { "epoch": 0.24068184709324514, "ewc_loss": 0.0291748046875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.193450927734375e-05, "grad_norm": 19.606536865234375, "learning_rate": 8.016108520559559e-07, "loss": 0.5121, "mean_token_accuracy": 0.8392413854598999, "num_tokens": 72347312.0, "step": 1892 }, { "epoch": 0.24080905737183564, "ewc_loss": 0.0291748046875, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.193450927734375e-05, "grad_norm": 20.028812408447266, "learning_rate": 8.020347604917338e-07, "loss": 0.4731, "mean_token_accuracy": 0.853114902973175, "num_tokens": 72382135.0, "step": 1893 }, { "epoch": 0.24093626765042617, "ewc_loss": 0.029541015625, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.2292137145996094e-05, "grad_norm": 19.632537841796875, "learning_rate": 8.024586689275116e-07, "loss": 0.5022, "mean_token_accuracy": 0.8447593450546265, "num_tokens": 72421526.0, "step": 1894 }, { "epoch": 0.24106347792901667, "ewc_loss": 0.0294189453125, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.2172927856445312e-05, "grad_norm": 19.82588768005371, "learning_rate": 8.028825773632894e-07, "loss": 0.5018, "mean_token_accuracy": 0.8463128805160522, "num_tokens": 72458592.0, "step": 1895 }, { "epoch": 0.24119068820760717, "ewc_loss": 0.0296630859375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.2411346435546875e-05, "grad_norm": 19.87993812561035, "learning_rate": 8.033064857990674e-07, "loss": 0.4333, "mean_token_accuracy": 0.8651313781738281, "num_tokens": 72494734.0, "step": 1896 }, { "epoch": 0.2413178984861977, "ewc_loss": 0.0296630859375, "ewc_loss_diag": 7.18235969543457e-06, "ewc_loss_parallel": 2.2411346435546875e-05, "grad_norm": 19.728771209716797, "learning_rate": 8.037303942348452e-07, "loss": 0.4836, "mean_token_accuracy": 0.8502976894378662, "num_tokens": 72531560.0, "step": 1897 }, { "epoch": 0.2414451087647882, "ewc_loss": 0.029541015625, "ewc_loss_diag": 7.241964340209961e-06, "ewc_loss_parallel": 2.2292137145996094e-05, "grad_norm": 19.858552932739258, "learning_rate": 8.041543026706231e-07, "loss": 0.4811, "mean_token_accuracy": 0.8510342240333557, "num_tokens": 72570144.0, "step": 1898 }, { "epoch": 0.2415723190433787, "ewc_loss": 0.0296630859375, "ewc_loss_diag": 7.241964340209961e-06, "ewc_loss_parallel": 2.2411346435546875e-05, "grad_norm": 19.66207504272461, "learning_rate": 8.04578211106401e-07, "loss": 0.4917, "mean_token_accuracy": 0.8449121713638306, "num_tokens": 72606738.0, "step": 1899 }, { "epoch": 0.24169952932196923, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 20.164081573486328, "learning_rate": 8.050021195421789e-07, "loss": 0.5351, "mean_token_accuracy": 0.8347603678703308, "num_tokens": 72646230.0, "step": 1900 }, { "epoch": 0.24182673960055973, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.795665740966797, "learning_rate": 8.054260279779567e-07, "loss": 0.5048, "mean_token_accuracy": 0.8412647843360901, "num_tokens": 72681811.0, "step": 1901 }, { "epoch": 0.24195394987915023, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 20.151941299438477, "learning_rate": 8.058499364137346e-07, "loss": 0.5405, "mean_token_accuracy": 0.8327749967575073, "num_tokens": 72721805.0, "step": 1902 }, { "epoch": 0.24208116015774075, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.73073959350586, "learning_rate": 8.062738448495124e-07, "loss": 0.4527, "mean_token_accuracy": 0.8583295345306396, "num_tokens": 72763142.0, "step": 1903 }, { "epoch": 0.24220837043633126, "ewc_loss": 0.029541015625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2292137145996094e-05, "grad_norm": 19.826223373413086, "learning_rate": 8.066977532852904e-07, "loss": 0.4448, "mean_token_accuracy": 0.8645913600921631, "num_tokens": 72802047.0, "step": 1904 }, { "epoch": 0.24233558071492176, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.927030563354492, "learning_rate": 8.071216617210682e-07, "loss": 0.5248, "mean_token_accuracy": 0.8372050523757935, "num_tokens": 72834470.0, "step": 1905 }, { "epoch": 0.24246279099351228, "ewc_loss": 0.0302734375, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.288818359375e-05, "grad_norm": 20.17230224609375, "learning_rate": 8.075455701568461e-07, "loss": 0.4933, "mean_token_accuracy": 0.8478735685348511, "num_tokens": 72873101.0, "step": 1906 }, { "epoch": 0.24259000127210278, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.917024612426758, "learning_rate": 8.07969478592624e-07, "loss": 0.4716, "mean_token_accuracy": 0.8571261763572693, "num_tokens": 72915504.0, "step": 1907 }, { "epoch": 0.24271721155069328, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2411346435546875e-05, "grad_norm": 19.82779312133789, "learning_rate": 8.083933870284019e-07, "loss": 0.445, "mean_token_accuracy": 0.8624507784843445, "num_tokens": 72947179.0, "step": 1908 }, { "epoch": 0.2428444218292838, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 20.060705184936523, "learning_rate": 8.088172954641796e-07, "loss": 0.4873, "mean_token_accuracy": 0.8495006561279297, "num_tokens": 72983313.0, "step": 1909 }, { "epoch": 0.2429716321078743, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 19.995548248291016, "learning_rate": 8.092412038999576e-07, "loss": 0.5386, "mean_token_accuracy": 0.833185076713562, "num_tokens": 73014218.0, "step": 1910 }, { "epoch": 0.24309884238646481, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.19583511352539, "learning_rate": 8.096651123357354e-07, "loss": 0.5475, "mean_token_accuracy": 0.8283640146255493, "num_tokens": 73048282.0, "step": 1911 }, { "epoch": 0.24322605266505534, "ewc_loss": 0.02978515625, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.86252212524414, "learning_rate": 8.100890207715134e-07, "loss": 0.5326, "mean_token_accuracy": 0.8363766670227051, "num_tokens": 73084390.0, "step": 1912 }, { "epoch": 0.24335326294364584, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 19.935035705566406, "learning_rate": 8.105129292072912e-07, "loss": 0.4995, "mean_token_accuracy": 0.845741868019104, "num_tokens": 73123296.0, "step": 1913 }, { "epoch": 0.24348047322223634, "ewc_loss": 0.0299072265625, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 20.07522201538086, "learning_rate": 8.10936837643069e-07, "loss": 0.5464, "mean_token_accuracy": 0.8335976600646973, "num_tokens": 73157533.0, "step": 1914 }, { "epoch": 0.24360768350082687, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 19.760478973388672, "learning_rate": 8.11360746078847e-07, "loss": 0.4778, "mean_token_accuracy": 0.8511161208152771, "num_tokens": 73195314.0, "step": 1915 }, { "epoch": 0.24373489377941737, "ewc_loss": 0.0299072265625, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 20.07062530517578, "learning_rate": 8.117846545146248e-07, "loss": 0.4472, "mean_token_accuracy": 0.8604342937469482, "num_tokens": 73236763.0, "step": 1916 }, { "epoch": 0.2438621040580079, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 19.990819931030273, "learning_rate": 8.122085629504026e-07, "loss": 0.5373, "mean_token_accuracy": 0.8314995765686035, "num_tokens": 73276719.0, "step": 1917 }, { "epoch": 0.2439893143365984, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.08063507080078, "learning_rate": 8.126324713861805e-07, "loss": 0.5023, "mean_token_accuracy": 0.8466892242431641, "num_tokens": 73318594.0, "step": 1918 }, { "epoch": 0.2441165246151889, "ewc_loss": 0.0299072265625, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.94573402404785, "learning_rate": 8.130563798219584e-07, "loss": 0.4755, "mean_token_accuracy": 0.8547689914703369, "num_tokens": 73358760.0, "step": 1919 }, { "epoch": 0.24424373489377943, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.3909759521484375e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 20.183748245239258, "learning_rate": 8.134802882577363e-07, "loss": 0.5198, "mean_token_accuracy": 0.8440244197845459, "num_tokens": 73398623.0, "step": 1920 }, { "epoch": 0.24437094517236993, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.3909759521484375e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.817354202270508, "learning_rate": 8.139041966935142e-07, "loss": 0.5011, "mean_token_accuracy": 0.8464621305465698, "num_tokens": 73446369.0, "step": 1921 }, { "epoch": 0.24449815545096043, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 20.110872268676758, "learning_rate": 8.14328105129292e-07, "loss": 0.562, "mean_token_accuracy": 0.8320327997207642, "num_tokens": 73483152.0, "step": 1922 }, { "epoch": 0.24462536572955096, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.200281143188477, "learning_rate": 8.1475201356507e-07, "loss": 0.5168, "mean_token_accuracy": 0.8416966795921326, "num_tokens": 73519179.0, "step": 1923 }, { "epoch": 0.24475257600814146, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 19.780668258666992, "learning_rate": 8.151759220008477e-07, "loss": 0.4881, "mean_token_accuracy": 0.8471894264221191, "num_tokens": 73561947.0, "step": 1924 }, { "epoch": 0.24487978628673196, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.502988815307617, "learning_rate": 8.155998304366256e-07, "loss": 0.5039, "mean_token_accuracy": 0.8492698073387146, "num_tokens": 73595841.0, "step": 1925 }, { "epoch": 0.2450069965653225, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.01350212097168, "learning_rate": 8.160237388724035e-07, "loss": 0.5292, "mean_token_accuracy": 0.8340639472007751, "num_tokens": 73629716.0, "step": 1926 }, { "epoch": 0.245134206843913, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 20.449989318847656, "learning_rate": 8.164476473081814e-07, "loss": 0.5167, "mean_token_accuracy": 0.8410772085189819, "num_tokens": 73667285.0, "step": 1927 }, { "epoch": 0.2452614171225035, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.210241317749023, "learning_rate": 8.168715557439593e-07, "loss": 0.4605, "mean_token_accuracy": 0.8579760193824768, "num_tokens": 73708059.0, "step": 1928 }, { "epoch": 0.24538862740109402, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.388763427734375, "learning_rate": 8.172954641797372e-07, "loss": 0.4719, "mean_token_accuracy": 0.8519940376281738, "num_tokens": 73748553.0, "step": 1929 }, { "epoch": 0.24551583767968452, "ewc_loss": 0.030029296875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2649765014648438e-05, "grad_norm": 20.186735153198242, "learning_rate": 8.17719372615515e-07, "loss": 0.5733, "mean_token_accuracy": 0.8243513107299805, "num_tokens": 73796295.0, "step": 1930 }, { "epoch": 0.24564304795827502, "ewc_loss": 0.0299072265625, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.2530555725097656e-05, "grad_norm": 19.916194915771484, "learning_rate": 8.18143281051293e-07, "loss": 0.554, "mean_token_accuracy": 0.8310136795043945, "num_tokens": 73835960.0, "step": 1931 }, { "epoch": 0.24577025823686555, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.176607131958008, "learning_rate": 8.185671894870707e-07, "loss": 0.5697, "mean_token_accuracy": 0.8270441889762878, "num_tokens": 73875302.0, "step": 1932 }, { "epoch": 0.24589746851545605, "ewc_loss": 0.0302734375, "ewc_loss_diag": 7.3909759521484375e-06, "ewc_loss_parallel": 2.288818359375e-05, "grad_norm": 20.044294357299805, "learning_rate": 8.189910979228485e-07, "loss": 0.4598, "mean_token_accuracy": 0.8558034896850586, "num_tokens": 73907703.0, "step": 1933 }, { "epoch": 0.24602467879404655, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 19.796035766601562, "learning_rate": 8.194150063586265e-07, "loss": 0.5545, "mean_token_accuracy": 0.8318365216255188, "num_tokens": 73945449.0, "step": 1934 }, { "epoch": 0.24615188907263708, "ewc_loss": 0.0301513671875, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 2.276897430419922e-05, "grad_norm": 20.095457077026367, "learning_rate": 8.198389147944043e-07, "loss": 0.5428, "mean_token_accuracy": 0.8293201923370361, "num_tokens": 73982461.0, "step": 1935 }, { "epoch": 0.24627909935122758, "ewc_loss": 0.030517578125, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3126602172851562e-05, "grad_norm": 20.13409996032715, "learning_rate": 8.202628232301823e-07, "loss": 0.5163, "mean_token_accuracy": 0.845039427280426, "num_tokens": 74018252.0, "step": 1936 }, { "epoch": 0.24640630962981808, "ewc_loss": 0.0303955078125, "ewc_loss_diag": 7.361173629760742e-06, "ewc_loss_parallel": 2.3126602172851562e-05, "grad_norm": 20.16545867919922, "learning_rate": 8.206867316659601e-07, "loss": 0.5357, "mean_token_accuracy": 0.833188533782959, "num_tokens": 74065843.0, "step": 1937 }, { "epoch": 0.2465335199084086, "ewc_loss": 0.0302734375, "ewc_loss_diag": 7.361173629760742e-06, "ewc_loss_parallel": 2.300739288330078e-05, "grad_norm": 20.210464477539062, "learning_rate": 8.21110640101738e-07, "loss": 0.4254, "mean_token_accuracy": 0.8649129271507263, "num_tokens": 74106389.0, "step": 1938 }, { "epoch": 0.2466607301869991, "ewc_loss": 0.0303955078125, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.288818359375e-05, "grad_norm": 19.975664138793945, "learning_rate": 8.215345485375159e-07, "loss": 0.5471, "mean_token_accuracy": 0.8322845101356506, "num_tokens": 74150786.0, "step": 1939 }, { "epoch": 0.2467879404655896, "ewc_loss": 0.0306396484375, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3245811462402344e-05, "grad_norm": 20.317766189575195, "learning_rate": 8.219584569732937e-07, "loss": 0.5168, "mean_token_accuracy": 0.8377938270568848, "num_tokens": 74190474.0, "step": 1940 }, { "epoch": 0.24691515074418013, "ewc_loss": 0.0306396484375, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3245811462402344e-05, "grad_norm": 20.45448875427246, "learning_rate": 8.223823654090715e-07, "loss": 0.4665, "mean_token_accuracy": 0.8558003902435303, "num_tokens": 74226437.0, "step": 1941 }, { "epoch": 0.24704236102277063, "ewc_loss": 0.0303955078125, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.288818359375e-05, "grad_norm": 19.663034439086914, "learning_rate": 8.228062738448495e-07, "loss": 0.4969, "mean_token_accuracy": 0.8423318862915039, "num_tokens": 74267292.0, "step": 1942 }, { "epoch": 0.24716957130136116, "ewc_loss": 0.030517578125, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3126602172851562e-05, "grad_norm": 20.62506866455078, "learning_rate": 8.232301822806273e-07, "loss": 0.5214, "mean_token_accuracy": 0.8391439914703369, "num_tokens": 74306738.0, "step": 1943 }, { "epoch": 0.24729678157995166, "ewc_loss": 0.0306396484375, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3245811462402344e-05, "grad_norm": 19.84124755859375, "learning_rate": 8.236540907164053e-07, "loss": 0.5113, "mean_token_accuracy": 0.8446427583694458, "num_tokens": 74343151.0, "step": 1944 }, { "epoch": 0.24742399185854216, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.24370574951172, "learning_rate": 8.240779991521831e-07, "loss": 0.4876, "mean_token_accuracy": 0.8477693200111389, "num_tokens": 74382755.0, "step": 1945 }, { "epoch": 0.2475512021371327, "ewc_loss": 0.0303955078125, "ewc_loss_diag": 7.4803829193115234e-06, "ewc_loss_parallel": 2.300739288330078e-05, "grad_norm": 20.188905715942383, "learning_rate": 8.24501907587961e-07, "loss": 0.5246, "mean_token_accuracy": 0.838270902633667, "num_tokens": 74426129.0, "step": 1946 }, { "epoch": 0.2476784124157232, "ewc_loss": 0.0308837890625, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 2.3484230041503906e-05, "grad_norm": 20.159748077392578, "learning_rate": 8.249258160237388e-07, "loss": 0.5368, "mean_token_accuracy": 0.839500904083252, "num_tokens": 74460904.0, "step": 1947 }, { "epoch": 0.2478056226943137, "ewc_loss": 0.030517578125, "ewc_loss_diag": 7.4803829193115234e-06, "ewc_loss_parallel": 2.3126602172851562e-05, "grad_norm": 20.189939498901367, "learning_rate": 8.253497244595167e-07, "loss": 0.5249, "mean_token_accuracy": 0.8383264541625977, "num_tokens": 74497993.0, "step": 1948 }, { "epoch": 0.24793283297290422, "ewc_loss": 0.0306396484375, "ewc_loss_diag": 7.4803829193115234e-06, "ewc_loss_parallel": 2.3245811462402344e-05, "grad_norm": 19.92631721496582, "learning_rate": 8.257736328952945e-07, "loss": 0.5413, "mean_token_accuracy": 0.8341482877731323, "num_tokens": 74541108.0, "step": 1949 }, { "epoch": 0.24806004325149472, "ewc_loss": 0.0311279296875, "ewc_loss_diag": 7.4803829193115234e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.299917221069336, "learning_rate": 8.261975413310725e-07, "loss": 0.5125, "mean_token_accuracy": 0.8431094288825989, "num_tokens": 74579389.0, "step": 1950 }, { "epoch": 0.24818725353008522, "ewc_loss": 0.03076171875, "ewc_loss_diag": 7.4803829193115234e-06, "ewc_loss_parallel": 2.3365020751953125e-05, "grad_norm": 19.929033279418945, "learning_rate": 8.266214497668503e-07, "loss": 0.4646, "mean_token_accuracy": 0.8595176935195923, "num_tokens": 74621546.0, "step": 1951 }, { "epoch": 0.24831446380867575, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.242399215698242, "learning_rate": 8.270453582026283e-07, "loss": 0.5474, "mean_token_accuracy": 0.8304349780082703, "num_tokens": 74660178.0, "step": 1952 }, { "epoch": 0.24844167408726625, "ewc_loss": 0.03076171875, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.3245811462402344e-05, "grad_norm": 19.96139144897461, "learning_rate": 8.274692666384061e-07, "loss": 0.4919, "mean_token_accuracy": 0.8477873802185059, "num_tokens": 74693867.0, "step": 1953 }, { "epoch": 0.24856888436585675, "ewc_loss": 0.03125, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.139144897460938, "learning_rate": 8.27893175074184e-07, "loss": 0.5074, "mean_token_accuracy": 0.8416606783866882, "num_tokens": 74727561.0, "step": 1954 }, { "epoch": 0.24869609464444728, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.296831130981445, "learning_rate": 8.283170835099618e-07, "loss": 0.5077, "mean_token_accuracy": 0.8454954028129578, "num_tokens": 74763719.0, "step": 1955 }, { "epoch": 0.24882330492303778, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.124813079833984, "learning_rate": 8.287409919457396e-07, "loss": 0.4726, "mean_token_accuracy": 0.8528652191162109, "num_tokens": 74802666.0, "step": 1956 }, { "epoch": 0.24895051520162828, "ewc_loss": 0.03125, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.144832611083984, "learning_rate": 8.291649003815175e-07, "loss": 0.5219, "mean_token_accuracy": 0.8397611975669861, "num_tokens": 74840307.0, "step": 1957 }, { "epoch": 0.2490777254802188, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.24774169921875, "learning_rate": 8.295888088172954e-07, "loss": 0.5327, "mean_token_accuracy": 0.8381834030151367, "num_tokens": 74880181.0, "step": 1958 }, { "epoch": 0.2492049357588093, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.3484230041503906e-05, "grad_norm": 20.191116333007812, "learning_rate": 8.300127172530733e-07, "loss": 0.4422, "mean_token_accuracy": 0.8610109090805054, "num_tokens": 74917148.0, "step": 1959 }, { "epoch": 0.2493321460373998, "ewc_loss": 0.03125, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.449779510498047, "learning_rate": 8.304366256888512e-07, "loss": 0.4792, "mean_token_accuracy": 0.8502135276794434, "num_tokens": 74953124.0, "step": 1960 }, { "epoch": 0.24945935631599034, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.32945442199707, "learning_rate": 8.308605341246291e-07, "loss": 0.4931, "mean_token_accuracy": 0.8468635082244873, "num_tokens": 74989726.0, "step": 1961 }, { "epoch": 0.24958656659458084, "ewc_loss": 0.03125, "ewc_loss_diag": 7.599592208862305e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.23198890686035, "learning_rate": 8.312844425604068e-07, "loss": 0.4947, "mean_token_accuracy": 0.8528809547424316, "num_tokens": 75024849.0, "step": 1962 }, { "epoch": 0.24971377687317134, "ewc_loss": 0.0311279296875, "ewc_loss_diag": 7.599592208862305e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.35301399230957, "learning_rate": 8.317083509961848e-07, "loss": 0.5228, "mean_token_accuracy": 0.8420448303222656, "num_tokens": 75066464.0, "step": 1963 }, { "epoch": 0.24984098715176187, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.599592208862305e-06, "ewc_loss_parallel": 2.3484230041503906e-05, "grad_norm": 20.188621520996094, "learning_rate": 8.321322594319626e-07, "loss": 0.4739, "mean_token_accuracy": 0.8522279262542725, "num_tokens": 75106829.0, "step": 1964 }, { "epoch": 0.24996819743035237, "ewc_loss": 0.03125, "ewc_loss_diag": 7.599592208862305e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.793603897094727, "learning_rate": 8.325561678677405e-07, "loss": 0.527, "mean_token_accuracy": 0.8387065529823303, "num_tokens": 75148227.0, "step": 1965 }, { "epoch": 0.2500954077089429, "ewc_loss": 0.0311279296875, "ewc_loss_diag": 7.539987564086914e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.17504119873047, "learning_rate": 8.329800763035184e-07, "loss": 0.5413, "mean_token_accuracy": 0.8329966068267822, "num_tokens": 75188719.0, "step": 1966 }, { "epoch": 0.25022261798753337, "ewc_loss": 0.03125, "ewc_loss_diag": 7.539987564086914e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.200157165527344, "learning_rate": 8.334039847392963e-07, "loss": 0.4917, "mean_token_accuracy": 0.8453341126441956, "num_tokens": 75222260.0, "step": 1967 }, { "epoch": 0.2503498282661239, "ewc_loss": 0.031005859375, "ewc_loss_diag": 7.599592208862305e-06, "ewc_loss_parallel": 2.3484230041503906e-05, "grad_norm": 20.36290168762207, "learning_rate": 8.338278931750742e-07, "loss": 0.4647, "mean_token_accuracy": 0.8560523986816406, "num_tokens": 75255489.0, "step": 1968 }, { "epoch": 0.2504770385447144, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.599592208862305e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.283905029296875, "learning_rate": 8.342518016108521e-07, "loss": 0.4941, "mean_token_accuracy": 0.8441625833511353, "num_tokens": 75290548.0, "step": 1969 }, { "epoch": 0.2506042488233049, "ewc_loss": 0.0311279296875, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.3484230041503906e-05, "grad_norm": 20.388731002807617, "learning_rate": 8.346757100466298e-07, "loss": 0.4933, "mean_token_accuracy": 0.8477362394332886, "num_tokens": 75323912.0, "step": 1970 }, { "epoch": 0.2507314591018954, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.364917755126953, "learning_rate": 8.350996184824078e-07, "loss": 0.5348, "mean_token_accuracy": 0.8339495658874512, "num_tokens": 75361206.0, "step": 1971 }, { "epoch": 0.25085866938048595, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.424713134765625, "learning_rate": 8.355235269181856e-07, "loss": 0.5166, "mean_token_accuracy": 0.840181827545166, "num_tokens": 75397292.0, "step": 1972 }, { "epoch": 0.2509858796590764, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.458236694335938, "learning_rate": 8.359474353539635e-07, "loss": 0.4717, "mean_token_accuracy": 0.8547057509422302, "num_tokens": 75437616.0, "step": 1973 }, { "epoch": 0.25111308993766696, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.713930130004883, "learning_rate": 8.363713437897414e-07, "loss": 0.5656, "mean_token_accuracy": 0.8275768160820007, "num_tokens": 75486172.0, "step": 1974 }, { "epoch": 0.2512403002162575, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.575559616088867, "learning_rate": 8.367952522255193e-07, "loss": 0.4993, "mean_token_accuracy": 0.841789722442627, "num_tokens": 75526637.0, "step": 1975 }, { "epoch": 0.25136751049484796, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.323102951049805, "learning_rate": 8.372191606612972e-07, "loss": 0.5003, "mean_token_accuracy": 0.8463973999023438, "num_tokens": 75564632.0, "step": 1976 }, { "epoch": 0.2514947207734385, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.70505142211914, "learning_rate": 8.376430690970749e-07, "loss": 0.4674, "mean_token_accuracy": 0.8565988540649414, "num_tokens": 75595553.0, "step": 1977 }, { "epoch": 0.251621931052029, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.48225212097168, "learning_rate": 8.380669775328528e-07, "loss": 0.462, "mean_token_accuracy": 0.8579760789871216, "num_tokens": 75635847.0, "step": 1978 }, { "epoch": 0.25174914133061954, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.609582901000977, "learning_rate": 8.384908859686307e-07, "loss": 0.46, "mean_token_accuracy": 0.8586071729660034, "num_tokens": 75677403.0, "step": 1979 }, { "epoch": 0.25187635160921, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.415721893310547, "learning_rate": 8.389147944044086e-07, "loss": 0.5031, "mean_token_accuracy": 0.8428777456283569, "num_tokens": 75720703.0, "step": 1980 }, { "epoch": 0.25200356188780054, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.562978744506836, "learning_rate": 8.393387028401864e-07, "loss": 0.5258, "mean_token_accuracy": 0.8379553556442261, "num_tokens": 75758000.0, "step": 1981 }, { "epoch": 0.25213077216639107, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.6470890045166, "learning_rate": 8.397626112759644e-07, "loss": 0.5548, "mean_token_accuracy": 0.8310660719871521, "num_tokens": 75791113.0, "step": 1982 }, { "epoch": 0.25225798244498154, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.62605094909668, "learning_rate": 8.401865197117422e-07, "loss": 0.5001, "mean_token_accuracy": 0.8466225266456604, "num_tokens": 75821733.0, "step": 1983 }, { "epoch": 0.25238519272357207, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.775766372680664, "learning_rate": 8.406104281475202e-07, "loss": 0.4765, "mean_token_accuracy": 0.8507956862449646, "num_tokens": 75861404.0, "step": 1984 }, { "epoch": 0.2525124030021626, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.451095581054688, "learning_rate": 8.410343365832979e-07, "loss": 0.4755, "mean_token_accuracy": 0.8510605096817017, "num_tokens": 75903059.0, "step": 1985 }, { "epoch": 0.2526396132807531, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.5142879486084, "learning_rate": 8.414582450190758e-07, "loss": 0.4892, "mean_token_accuracy": 0.848701000213623, "num_tokens": 75947125.0, "step": 1986 }, { "epoch": 0.2527668235593436, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 21.09033203125, "learning_rate": 8.418821534548537e-07, "loss": 0.5752, "mean_token_accuracy": 0.8245248198509216, "num_tokens": 75987519.0, "step": 1987 }, { "epoch": 0.25289403383793413, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.523832321166992, "learning_rate": 8.423060618906316e-07, "loss": 0.502, "mean_token_accuracy": 0.8454207181930542, "num_tokens": 76027728.0, "step": 1988 }, { "epoch": 0.2530212441165246, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.998506546020508, "learning_rate": 8.427299703264095e-07, "loss": 0.4996, "mean_token_accuracy": 0.8487264513969421, "num_tokens": 76070942.0, "step": 1989 }, { "epoch": 0.25314845439511513, "ewc_loss": 0.03125, "ewc_loss_diag": 7.68899917602539e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 21.31647491455078, "learning_rate": 8.431538787621874e-07, "loss": 0.5266, "mean_token_accuracy": 0.8396706581115723, "num_tokens": 76113330.0, "step": 1990 }, { "epoch": 0.25327566467370566, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.370325088500977, "learning_rate": 8.435777871979652e-07, "loss": 0.4769, "mean_token_accuracy": 0.8534902334213257, "num_tokens": 76153511.0, "step": 1991 }, { "epoch": 0.25340287495229613, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 21.268278121948242, "learning_rate": 8.440016956337432e-07, "loss": 0.5039, "mean_token_accuracy": 0.8461060523986816, "num_tokens": 76191505.0, "step": 1992 }, { "epoch": 0.25353008523088666, "ewc_loss": 0.03125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.3603439331054688e-05, "grad_norm": 20.49579620361328, "learning_rate": 8.444256040695209e-07, "loss": 0.5188, "mean_token_accuracy": 0.8426104784011841, "num_tokens": 76232479.0, "step": 1993 }, { "epoch": 0.2536572955094772, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.867521286010742, "learning_rate": 8.448495125052988e-07, "loss": 0.514, "mean_token_accuracy": 0.842623233795166, "num_tokens": 76270192.0, "step": 1994 }, { "epoch": 0.25378450578806766, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.9370174407959, "learning_rate": 8.452734209410767e-07, "loss": 0.4529, "mean_token_accuracy": 0.860924482345581, "num_tokens": 76303981.0, "step": 1995 }, { "epoch": 0.2539117160666582, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.544723510742188, "learning_rate": 8.456973293768545e-07, "loss": 0.4784, "mean_token_accuracy": 0.850900411605835, "num_tokens": 76338496.0, "step": 1996 }, { "epoch": 0.2540389263452487, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.62777328491211, "learning_rate": 8.461212378126325e-07, "loss": 0.5319, "mean_token_accuracy": 0.8367282748222351, "num_tokens": 76377904.0, "step": 1997 }, { "epoch": 0.2541661366238392, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.782869338989258, "learning_rate": 8.465451462484103e-07, "loss": 0.5555, "mean_token_accuracy": 0.8309597969055176, "num_tokens": 76413905.0, "step": 1998 }, { "epoch": 0.2542933469024297, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.639684677124023, "learning_rate": 8.469690546841882e-07, "loss": 0.5054, "mean_token_accuracy": 0.8441073894500732, "num_tokens": 76451591.0, "step": 1999 }, { "epoch": 0.25442055718102025, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.884197235107422, "learning_rate": 8.47392963119966e-07, "loss": 0.4856, "mean_token_accuracy": 0.8493395447731018, "num_tokens": 76486191.0, "step": 2000 }, { "epoch": 0.2545477674596107, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.384185791015625e-05, "grad_norm": 20.612125396728516, "learning_rate": 8.478168715557439e-07, "loss": 0.5034, "mean_token_accuracy": 0.8479421138763428, "num_tokens": 76523317.0, "step": 2001 }, { "epoch": 0.25467497773820125, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 20.77923011779785, "learning_rate": 8.482407799915217e-07, "loss": 0.5108, "mean_token_accuracy": 0.8425862789154053, "num_tokens": 76562661.0, "step": 2002 }, { "epoch": 0.2548021880167918, "ewc_loss": 0.031494140625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.372264862060547e-05, "grad_norm": 20.57289695739746, "learning_rate": 8.486646884272997e-07, "loss": 0.5048, "mean_token_accuracy": 0.8419800400733948, "num_tokens": 76600914.0, "step": 2003 }, { "epoch": 0.25492939829538225, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 20.84684181213379, "learning_rate": 8.490885968630775e-07, "loss": 0.5252, "mean_token_accuracy": 0.8366991877555847, "num_tokens": 76636310.0, "step": 2004 }, { "epoch": 0.2550566085739728, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.518587112426758, "learning_rate": 8.495125052988555e-07, "loss": 0.5088, "mean_token_accuracy": 0.8464745879173279, "num_tokens": 76680889.0, "step": 2005 }, { "epoch": 0.2551838188525633, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 20.936410903930664, "learning_rate": 8.499364137346333e-07, "loss": 0.5022, "mean_token_accuracy": 0.8501507639884949, "num_tokens": 76722469.0, "step": 2006 }, { "epoch": 0.2553110291311538, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.582448959350586, "learning_rate": 8.503603221704112e-07, "loss": 0.5389, "mean_token_accuracy": 0.834929347038269, "num_tokens": 76757668.0, "step": 2007 }, { "epoch": 0.2554382394097443, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 20.833005905151367, "learning_rate": 8.50784230606189e-07, "loss": 0.4987, "mean_token_accuracy": 0.8475079536437988, "num_tokens": 76795607.0, "step": 2008 }, { "epoch": 0.25556544968833483, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.59632682800293, "learning_rate": 8.512081390419669e-07, "loss": 0.5277, "mean_token_accuracy": 0.8445346355438232, "num_tokens": 76837161.0, "step": 2009 }, { "epoch": 0.2556926599669253, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4437904357910156e-05, "grad_norm": 20.768566131591797, "learning_rate": 8.516320474777447e-07, "loss": 0.5051, "mean_token_accuracy": 0.8446636199951172, "num_tokens": 76872198.0, "step": 2010 }, { "epoch": 0.25581987024551583, "ewc_loss": 0.032470703125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.467632293701172e-05, "grad_norm": 20.765151977539062, "learning_rate": 8.520559559135227e-07, "loss": 0.472, "mean_token_accuracy": 0.8572216629981995, "num_tokens": 76908976.0, "step": 2011 }, { "epoch": 0.25594708052410636, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4437904357910156e-05, "grad_norm": 21.039337158203125, "learning_rate": 8.524798643493005e-07, "loss": 0.524, "mean_token_accuracy": 0.8439326286315918, "num_tokens": 76947298.0, "step": 2012 }, { "epoch": 0.25607429080269684, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.747039794921875, "learning_rate": 8.529037727850785e-07, "loss": 0.4786, "mean_token_accuracy": 0.8509160280227661, "num_tokens": 76985112.0, "step": 2013 }, { "epoch": 0.25620150108128736, "ewc_loss": 0.032470703125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.467632293701172e-05, "grad_norm": 21.436813354492188, "learning_rate": 8.533276812208563e-07, "loss": 0.4517, "mean_token_accuracy": 0.8600617051124573, "num_tokens": 77020309.0, "step": 2014 }, { "epoch": 0.2563287113598779, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.761810302734375, "learning_rate": 8.53751589656634e-07, "loss": 0.4928, "mean_token_accuracy": 0.8458610773086548, "num_tokens": 77063245.0, "step": 2015 }, { "epoch": 0.25645592163846836, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4557113647460938e-05, "grad_norm": 20.90212631225586, "learning_rate": 8.54175498092412e-07, "loss": 0.49, "mean_token_accuracy": 0.8491048812866211, "num_tokens": 77096791.0, "step": 2016 }, { "epoch": 0.2565831319170589, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4199485778808594e-05, "grad_norm": 21.1782169342041, "learning_rate": 8.545994065281898e-07, "loss": 0.4664, "mean_token_accuracy": 0.8564184904098511, "num_tokens": 77129281.0, "step": 2017 }, { "epoch": 0.2567103421956494, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 20.67928123474121, "learning_rate": 8.550233149639677e-07, "loss": 0.5563, "mean_token_accuracy": 0.8275189399719238, "num_tokens": 77168785.0, "step": 2018 }, { "epoch": 0.2568375524742399, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4437904357910156e-05, "grad_norm": 20.824092864990234, "learning_rate": 8.554472233997456e-07, "loss": 0.5237, "mean_token_accuracy": 0.8355070948600769, "num_tokens": 77208360.0, "step": 2019 }, { "epoch": 0.2569647627528304, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4437904357910156e-05, "grad_norm": 21.341320037841797, "learning_rate": 8.558711318355235e-07, "loss": 0.5563, "mean_token_accuracy": 0.8275998830795288, "num_tokens": 77245539.0, "step": 2020 }, { "epoch": 0.25709197303142095, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4199485778808594e-05, "grad_norm": 20.693336486816406, "learning_rate": 8.562950402713014e-07, "loss": 0.4583, "mean_token_accuracy": 0.863017737865448, "num_tokens": 77279118.0, "step": 2021 }, { "epoch": 0.2572191833100114, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4557113647460938e-05, "grad_norm": 21.771846771240234, "learning_rate": 8.567189487070793e-07, "loss": 0.5472, "mean_token_accuracy": 0.8341283202171326, "num_tokens": 77316259.0, "step": 2022 }, { "epoch": 0.25734639358860195, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.685039520263672, "learning_rate": 8.57142857142857e-07, "loss": 0.5661, "mean_token_accuracy": 0.8281040787696838, "num_tokens": 77359712.0, "step": 2023 }, { "epoch": 0.2574736038671925, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4437904357910156e-05, "grad_norm": 21.300880432128906, "learning_rate": 8.57566765578635e-07, "loss": 0.4664, "mean_token_accuracy": 0.8592913150787354, "num_tokens": 77402269.0, "step": 2024 }, { "epoch": 0.25760081414578295, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.885568618774414, "learning_rate": 8.579906740144128e-07, "loss": 0.4943, "mean_token_accuracy": 0.848667562007904, "num_tokens": 77445791.0, "step": 2025 }, { "epoch": 0.2577280244243735, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 21.07828140258789, "learning_rate": 8.584145824501907e-07, "loss": 0.5276, "mean_token_accuracy": 0.8371331691741943, "num_tokens": 77485376.0, "step": 2026 }, { "epoch": 0.257855234702964, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 2.396106719970703e-05, "grad_norm": 20.645784378051758, "learning_rate": 8.588384908859686e-07, "loss": 0.5253, "mean_token_accuracy": 0.8407809734344482, "num_tokens": 77524344.0, "step": 2027 }, { "epoch": 0.25798244498155454, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4557113647460938e-05, "grad_norm": 21.466442108154297, "learning_rate": 8.592623993217465e-07, "loss": 0.5208, "mean_token_accuracy": 0.842576265335083, "num_tokens": 77562746.0, "step": 2028 }, { "epoch": 0.258109655260145, "ewc_loss": 0.03173828125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4080276489257812e-05, "grad_norm": 20.54645347595215, "learning_rate": 8.596863077575244e-07, "loss": 0.5415, "mean_token_accuracy": 0.8354895114898682, "num_tokens": 77601261.0, "step": 2029 }, { "epoch": 0.25823686553873554, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.47955322265625e-05, "grad_norm": 21.09265899658203, "learning_rate": 8.601102161933023e-07, "loss": 0.5236, "mean_token_accuracy": 0.8399668335914612, "num_tokens": 77640054.0, "step": 2030 }, { "epoch": 0.25836407581732607, "ewc_loss": 0.031982421875, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4199485778808594e-05, "grad_norm": 21.032947540283203, "learning_rate": 8.6053412462908e-07, "loss": 0.4967, "mean_token_accuracy": 0.8475003838539124, "num_tokens": 77680098.0, "step": 2031 }, { "epoch": 0.25849128609591654, "ewc_loss": 0.032470703125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.467632293701172e-05, "grad_norm": 20.982847213745117, "learning_rate": 8.60958033064858e-07, "loss": 0.4889, "mean_token_accuracy": 0.8506404161453247, "num_tokens": 77712517.0, "step": 2032 }, { "epoch": 0.25861849637450707, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4318695068359375e-05, "grad_norm": 20.814373016357422, "learning_rate": 8.613819415006358e-07, "loss": 0.5531, "mean_token_accuracy": 0.8366722464561462, "num_tokens": 77751335.0, "step": 2033 }, { "epoch": 0.2587457066530976, "ewc_loss": 0.032470703125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.467632293701172e-05, "grad_norm": 21.015893936157227, "learning_rate": 8.618058499364137e-07, "loss": 0.5385, "mean_token_accuracy": 0.8347886800765991, "num_tokens": 77792300.0, "step": 2034 }, { "epoch": 0.25887291693168807, "ewc_loss": 0.0322265625, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.4437904357910156e-05, "grad_norm": 20.88245391845703, "learning_rate": 8.622297583721916e-07, "loss": 0.497, "mean_token_accuracy": 0.8503001928329468, "num_tokens": 77836391.0, "step": 2035 }, { "epoch": 0.2590001272102786, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.47955322265625e-05, "grad_norm": 20.906211853027344, "learning_rate": 8.626536668079695e-07, "loss": 0.5091, "mean_token_accuracy": 0.8413053750991821, "num_tokens": 77871600.0, "step": 2036 }, { "epoch": 0.2591273374888691, "ewc_loss": 0.032470703125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.467632293701172e-05, "grad_norm": 20.927671432495117, "learning_rate": 8.630775752437474e-07, "loss": 0.4987, "mean_token_accuracy": 0.847411572933197, "num_tokens": 77904623.0, "step": 2037 }, { "epoch": 0.2592545477674596, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.47955322265625e-05, "grad_norm": 20.625625610351562, "learning_rate": 8.635014836795251e-07, "loss": 0.5372, "mean_token_accuracy": 0.836286187171936, "num_tokens": 77941145.0, "step": 2038 }, { "epoch": 0.2593817580460501, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.049739837646484, "learning_rate": 8.63925392115303e-07, "loss": 0.4748, "mean_token_accuracy": 0.8524529933929443, "num_tokens": 77975087.0, "step": 2039 }, { "epoch": 0.25950896832464065, "ewc_loss": 0.032470703125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.467632293701172e-05, "grad_norm": 20.75774574279785, "learning_rate": 8.643493005510809e-07, "loss": 0.482, "mean_token_accuracy": 0.8505391478538513, "num_tokens": 78009526.0, "step": 2040 }, { "epoch": 0.2596361786032311, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 20.96986961364746, "learning_rate": 8.647732089868588e-07, "loss": 0.5597, "mean_token_accuracy": 0.8288479447364807, "num_tokens": 78050918.0, "step": 2041 }, { "epoch": 0.25976338888182166, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.491474151611328e-05, "grad_norm": 20.796661376953125, "learning_rate": 8.651971174226366e-07, "loss": 0.4761, "mean_token_accuracy": 0.8537163734436035, "num_tokens": 78085547.0, "step": 2042 }, { "epoch": 0.2598905991604122, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 20.90944480895996, "learning_rate": 8.656210258584146e-07, "loss": 0.5458, "mean_token_accuracy": 0.8319536447525024, "num_tokens": 78122916.0, "step": 2043 }, { "epoch": 0.26001780943900266, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5033950805664062e-05, "grad_norm": 21.13486671447754, "learning_rate": 8.660449342941924e-07, "loss": 0.539, "mean_token_accuracy": 0.8325254917144775, "num_tokens": 78161800.0, "step": 2044 }, { "epoch": 0.2601450197175932, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 21.026039123535156, "learning_rate": 8.664688427299704e-07, "loss": 0.5167, "mean_token_accuracy": 0.8411990404129028, "num_tokens": 78199961.0, "step": 2045 }, { "epoch": 0.2602722299961837, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.22577476501465, "learning_rate": 8.668927511657481e-07, "loss": 0.4737, "mean_token_accuracy": 0.8530804514884949, "num_tokens": 78231332.0, "step": 2046 }, { "epoch": 0.2603994402747742, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.115215301513672, "learning_rate": 8.67316659601526e-07, "loss": 0.5304, "mean_token_accuracy": 0.8361383676528931, "num_tokens": 78265054.0, "step": 2047 }, { "epoch": 0.2605266505533647, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 21.05387306213379, "learning_rate": 8.677405680373039e-07, "loss": 0.4877, "mean_token_accuracy": 0.852808952331543, "num_tokens": 78305445.0, "step": 2048 }, { "epoch": 0.26065386083195524, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.491474151611328e-05, "grad_norm": 21.095163345336914, "learning_rate": 8.681644764730818e-07, "loss": 0.5173, "mean_token_accuracy": 0.841035008430481, "num_tokens": 78339174.0, "step": 2049 }, { "epoch": 0.2607810711105457, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.491474151611328e-05, "grad_norm": 20.882875442504883, "learning_rate": 8.685883849088596e-07, "loss": 0.4585, "mean_token_accuracy": 0.859413743019104, "num_tokens": 78377630.0, "step": 2050 }, { "epoch": 0.26090828138913624, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.186307907104492, "learning_rate": 8.690122933446376e-07, "loss": 0.5145, "mean_token_accuracy": 0.8455421924591064, "num_tokens": 78419908.0, "step": 2051 }, { "epoch": 0.26103549166772677, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 20.940780639648438, "learning_rate": 8.694362017804154e-07, "loss": 0.4812, "mean_token_accuracy": 0.853652834892273, "num_tokens": 78459561.0, "step": 2052 }, { "epoch": 0.26116270194631724, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.033267974853516, "learning_rate": 8.698601102161933e-07, "loss": 0.5141, "mean_token_accuracy": 0.8427216410636902, "num_tokens": 78502500.0, "step": 2053 }, { "epoch": 0.26128991222490777, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.520925521850586, "learning_rate": 8.702840186519711e-07, "loss": 0.4705, "mean_token_accuracy": 0.8549848794937134, "num_tokens": 78545411.0, "step": 2054 }, { "epoch": 0.2614171225034983, "ewc_loss": 0.03271484375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5033950805664062e-05, "grad_norm": 20.902587890625, "learning_rate": 8.70707927087749e-07, "loss": 0.4689, "mean_token_accuracy": 0.8560612797737122, "num_tokens": 78585878.0, "step": 2055 }, { "epoch": 0.2615443327820888, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.164443969726562, "learning_rate": 8.711318355235269e-07, "loss": 0.5593, "mean_token_accuracy": 0.8261680006980896, "num_tokens": 78621233.0, "step": 2056 }, { "epoch": 0.2616715430606793, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5033950805664062e-05, "grad_norm": 21.05866050720215, "learning_rate": 8.715557439593047e-07, "loss": 0.4789, "mean_token_accuracy": 0.8507004976272583, "num_tokens": 78654644.0, "step": 2057 }, { "epoch": 0.26179875333926983, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 20.921472549438477, "learning_rate": 8.719796523950826e-07, "loss": 0.5575, "mean_token_accuracy": 0.8277868032455444, "num_tokens": 78692851.0, "step": 2058 }, { "epoch": 0.2619259636178603, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.230369567871094, "learning_rate": 8.724035608308605e-07, "loss": 0.4513, "mean_token_accuracy": 0.8616067171096802, "num_tokens": 78725329.0, "step": 2059 }, { "epoch": 0.26205317389645083, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.082679748535156, "learning_rate": 8.728274692666384e-07, "loss": 0.5074, "mean_token_accuracy": 0.8438834547996521, "num_tokens": 78766645.0, "step": 2060 }, { "epoch": 0.26218038417504136, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 21.11827850341797, "learning_rate": 8.732513777024162e-07, "loss": 0.4996, "mean_token_accuracy": 0.8475784063339233, "num_tokens": 78801576.0, "step": 2061 }, { "epoch": 0.26230759445363183, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5153160095214844e-05, "grad_norm": 21.285396575927734, "learning_rate": 8.736752861381941e-07, "loss": 0.5458, "mean_token_accuracy": 0.8325638771057129, "num_tokens": 78845577.0, "step": 2062 }, { "epoch": 0.26243480473222236, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.08141326904297, "learning_rate": 8.740991945739719e-07, "loss": 0.511, "mean_token_accuracy": 0.8439178466796875, "num_tokens": 78884229.0, "step": 2063 }, { "epoch": 0.2625620150108129, "ewc_loss": 0.032958984375, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5033950805664062e-05, "grad_norm": 21.628938674926758, "learning_rate": 8.745231030097499e-07, "loss": 0.4959, "mean_token_accuracy": 0.8485449552536011, "num_tokens": 78926948.0, "step": 2064 }, { "epoch": 0.26268922528940336, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.30962562561035, "learning_rate": 8.749470114455277e-07, "loss": 0.4723, "mean_token_accuracy": 0.8534228205680847, "num_tokens": 78963582.0, "step": 2065 }, { "epoch": 0.2628164355679939, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.185199737548828, "learning_rate": 8.753709198813056e-07, "loss": 0.5816, "mean_token_accuracy": 0.8222624659538269, "num_tokens": 79007345.0, "step": 2066 }, { "epoch": 0.2629436458465844, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.291576385498047, "learning_rate": 8.757948283170835e-07, "loss": 0.4775, "mean_token_accuracy": 0.855554461479187, "num_tokens": 79049505.0, "step": 2067 }, { "epoch": 0.2630708561251749, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.18641471862793, "learning_rate": 8.762187367528613e-07, "loss": 0.4728, "mean_token_accuracy": 0.8526824712753296, "num_tokens": 79087419.0, "step": 2068 }, { "epoch": 0.2631980664037654, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5272369384765625e-05, "grad_norm": 21.082197189331055, "learning_rate": 8.766426451886392e-07, "loss": 0.5203, "mean_token_accuracy": 0.838528037071228, "num_tokens": 79131444.0, "step": 2069 }, { "epoch": 0.26332527668235595, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.562999725341797e-05, "grad_norm": 21.216495513916016, "learning_rate": 8.770665536244171e-07, "loss": 0.5478, "mean_token_accuracy": 0.8359054327011108, "num_tokens": 79171976.0, "step": 2070 }, { "epoch": 0.2634524869609464, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 21.15412139892578, "learning_rate": 8.774904620601949e-07, "loss": 0.475, "mean_token_accuracy": 0.8527939915657043, "num_tokens": 79206273.0, "step": 2071 }, { "epoch": 0.26357969723953695, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.562999725341797e-05, "grad_norm": 21.27859115600586, "learning_rate": 8.779143704959729e-07, "loss": 0.4794, "mean_token_accuracy": 0.8527618646621704, "num_tokens": 79245564.0, "step": 2072 }, { "epoch": 0.2637069075181275, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 20.913455963134766, "learning_rate": 8.783382789317507e-07, "loss": 0.5047, "mean_token_accuracy": 0.8449820876121521, "num_tokens": 79281857.0, "step": 2073 }, { "epoch": 0.26383411779671795, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.574920654296875e-05, "grad_norm": 21.375423431396484, "learning_rate": 8.787621873675286e-07, "loss": 0.5425, "mean_token_accuracy": 0.8299901485443115, "num_tokens": 79312482.0, "step": 2074 }, { "epoch": 0.2639613280753085, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.562999725341797e-05, "grad_norm": 21.333498001098633, "learning_rate": 8.791860958033065e-07, "loss": 0.4811, "mean_token_accuracy": 0.8534277081489563, "num_tokens": 79348157.0, "step": 2075 }, { "epoch": 0.264088538353899, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.353689193725586, "learning_rate": 8.796100042390842e-07, "loss": 0.4667, "mean_token_accuracy": 0.856977641582489, "num_tokens": 79387540.0, "step": 2076 }, { "epoch": 0.2642157486324895, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.574920654296875e-05, "grad_norm": 21.165416717529297, "learning_rate": 8.800339126748622e-07, "loss": 0.4858, "mean_token_accuracy": 0.8498029708862305, "num_tokens": 79421424.0, "step": 2077 }, { "epoch": 0.26434295891108, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 21.272676467895508, "learning_rate": 8.8045782111064e-07, "loss": 0.556, "mean_token_accuracy": 0.8287626504898071, "num_tokens": 79458733.0, "step": 2078 }, { "epoch": 0.26447016918967053, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.519573211669922, "learning_rate": 8.808817295464179e-07, "loss": 0.5086, "mean_token_accuracy": 0.8466061949729919, "num_tokens": 79495394.0, "step": 2079 }, { "epoch": 0.26459737946826106, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.30546760559082, "learning_rate": 8.813056379821958e-07, "loss": 0.4782, "mean_token_accuracy": 0.8545520901679993, "num_tokens": 79529799.0, "step": 2080 }, { "epoch": 0.26472458974685154, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 2.574920654296875e-05, "grad_norm": 21.27413558959961, "learning_rate": 8.817295464179737e-07, "loss": 0.5046, "mean_token_accuracy": 0.8440815210342407, "num_tokens": 79564664.0, "step": 2081 }, { "epoch": 0.26485180002544206, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.574920654296875e-05, "grad_norm": 21.520851135253906, "learning_rate": 8.821534548537515e-07, "loss": 0.5836, "mean_token_accuracy": 0.8193337917327881, "num_tokens": 79606104.0, "step": 2082 }, { "epoch": 0.2649790103040326, "ewc_loss": 0.033203125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5391578674316406e-05, "grad_norm": 20.987895965576172, "learning_rate": 8.825773632895295e-07, "loss": 0.4766, "mean_token_accuracy": 0.8543379306793213, "num_tokens": 79647688.0, "step": 2083 }, { "epoch": 0.26510622058262306, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.586841583251953e-05, "grad_norm": 21.361572265625, "learning_rate": 8.830012717253072e-07, "loss": 0.495, "mean_token_accuracy": 0.8501412272453308, "num_tokens": 79685500.0, "step": 2084 }, { "epoch": 0.2652334308612136, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.166662216186523, "learning_rate": 8.834251801610852e-07, "loss": 0.5236, "mean_token_accuracy": 0.838939905166626, "num_tokens": 79729912.0, "step": 2085 }, { "epoch": 0.2653606411398041, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.562999725341797e-05, "grad_norm": 21.179054260253906, "learning_rate": 8.83849088596863e-07, "loss": 0.481, "mean_token_accuracy": 0.8536058068275452, "num_tokens": 79767487.0, "step": 2086 }, { "epoch": 0.2654878514183946, "ewc_loss": 0.033447265625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5510787963867188e-05, "grad_norm": 21.200313568115234, "learning_rate": 8.842729970326409e-07, "loss": 0.4579, "mean_token_accuracy": 0.8620293140411377, "num_tokens": 79804802.0, "step": 2087 }, { "epoch": 0.2656150616969851, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.574920654296875e-05, "grad_norm": 21.143247604370117, "learning_rate": 8.846969054684188e-07, "loss": 0.5775, "mean_token_accuracy": 0.8261789083480835, "num_tokens": 79845685.0, "step": 2088 }, { "epoch": 0.26574227197557565, "ewc_loss": 0.033935546875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.5987625122070312e-05, "grad_norm": 21.617313385009766, "learning_rate": 8.851208139041967e-07, "loss": 0.4679, "mean_token_accuracy": 0.854831337928772, "num_tokens": 79884692.0, "step": 2089 }, { "epoch": 0.2658694822541661, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.586841583251953e-05, "grad_norm": 20.87154197692871, "learning_rate": 8.855447223399745e-07, "loss": 0.527, "mean_token_accuracy": 0.8364255428314209, "num_tokens": 79919780.0, "step": 2090 }, { "epoch": 0.26599669253275665, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6226043701171875e-05, "grad_norm": 21.543283462524414, "learning_rate": 8.859686307757524e-07, "loss": 0.5518, "mean_token_accuracy": 0.8310023546218872, "num_tokens": 79958317.0, "step": 2091 }, { "epoch": 0.2661239028113472, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.586841583251953e-05, "grad_norm": 21.372346878051758, "learning_rate": 8.863925392115302e-07, "loss": 0.484, "mean_token_accuracy": 0.8504612445831299, "num_tokens": 79994126.0, "step": 2092 }, { "epoch": 0.26625111308993765, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.586841583251953e-05, "grad_norm": 21.210678100585938, "learning_rate": 8.868164476473082e-07, "loss": 0.4549, "mean_token_accuracy": 0.8616694211959839, "num_tokens": 80033157.0, "step": 2093 }, { "epoch": 0.2663783233685282, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6106834411621094e-05, "grad_norm": 21.063425064086914, "learning_rate": 8.87240356083086e-07, "loss": 0.483, "mean_token_accuracy": 0.8535229563713074, "num_tokens": 80073354.0, "step": 2094 }, { "epoch": 0.2665055336471187, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6106834411621094e-05, "grad_norm": 21.42526626586914, "learning_rate": 8.876642645188639e-07, "loss": 0.5456, "mean_token_accuracy": 0.8331428170204163, "num_tokens": 80111819.0, "step": 2095 }, { "epoch": 0.2666327439257092, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.292448043823242, "learning_rate": 8.880881729546418e-07, "loss": 0.4605, "mean_token_accuracy": 0.8584235906600952, "num_tokens": 80147951.0, "step": 2096 }, { "epoch": 0.2667599542042997, "ewc_loss": 0.03369140625, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.586841583251953e-05, "grad_norm": 21.4979248046875, "learning_rate": 8.885120813904197e-07, "loss": 0.4721, "mean_token_accuracy": 0.8538864850997925, "num_tokens": 80191319.0, "step": 2097 }, { "epoch": 0.26688716448289024, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 20.958175659179688, "learning_rate": 8.889359898261976e-07, "loss": 0.5616, "mean_token_accuracy": 0.8316607475280762, "num_tokens": 80226300.0, "step": 2098 }, { "epoch": 0.2670143747614807, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.634559631347656, "learning_rate": 8.893598982619753e-07, "loss": 0.4857, "mean_token_accuracy": 0.8512451648712158, "num_tokens": 80265465.0, "step": 2099 }, { "epoch": 0.26714158504007124, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6226043701171875e-05, "grad_norm": 21.29325294494629, "learning_rate": 8.897838066977532e-07, "loss": 0.4864, "mean_token_accuracy": 0.8513956665992737, "num_tokens": 80312696.0, "step": 2100 }, { "epoch": 0.26726879531866177, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.74942970275879, "learning_rate": 8.902077151335311e-07, "loss": 0.5097, "mean_token_accuracy": 0.8425403833389282, "num_tokens": 80353507.0, "step": 2101 }, { "epoch": 0.26739600559725224, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6106834411621094e-05, "grad_norm": 21.213619232177734, "learning_rate": 8.90631623569309e-07, "loss": 0.592, "mean_token_accuracy": 0.8167800307273865, "num_tokens": 80390795.0, "step": 2102 }, { "epoch": 0.26752321587584277, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6106834411621094e-05, "grad_norm": 21.290950775146484, "learning_rate": 8.910555320050868e-07, "loss": 0.5006, "mean_token_accuracy": 0.8455634117126465, "num_tokens": 80427599.0, "step": 2103 }, { "epoch": 0.2676504261544333, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6106834411621094e-05, "grad_norm": 21.436141967773438, "learning_rate": 8.914794404408648e-07, "loss": 0.504, "mean_token_accuracy": 0.8466511964797974, "num_tokens": 80468084.0, "step": 2104 }, { "epoch": 0.26777763643302377, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.282136917114258, "learning_rate": 8.919033488766426e-07, "loss": 0.4487, "mean_token_accuracy": 0.8633596897125244, "num_tokens": 80507218.0, "step": 2105 }, { "epoch": 0.2679048467116143, "ewc_loss": 0.034423828125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.523637771606445, "learning_rate": 8.923272573124204e-07, "loss": 0.5321, "mean_token_accuracy": 0.838336706161499, "num_tokens": 80541730.0, "step": 2106 }, { "epoch": 0.2680320569902048, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.3370418548584, "learning_rate": 8.927511657481983e-07, "loss": 0.5426, "mean_token_accuracy": 0.8329876661300659, "num_tokens": 80584677.0, "step": 2107 }, { "epoch": 0.2681592672687953, "ewc_loss": 0.034423828125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.266630172729492, "learning_rate": 8.931750741839762e-07, "loss": 0.4784, "mean_token_accuracy": 0.8550428748130798, "num_tokens": 80616332.0, "step": 2108 }, { "epoch": 0.2682864775473858, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.318740844726562, "learning_rate": 8.935989826197541e-07, "loss": 0.4854, "mean_token_accuracy": 0.8530329465866089, "num_tokens": 80654435.0, "step": 2109 }, { "epoch": 0.26841368782597635, "ewc_loss": 0.0341796875, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.355192184448242, "learning_rate": 8.94022891055532e-07, "loss": 0.4626, "mean_token_accuracy": 0.862358570098877, "num_tokens": 80691246.0, "step": 2110 }, { "epoch": 0.2685408981045668, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 21.46246337890625, "learning_rate": 8.944467994913098e-07, "loss": 0.5144, "mean_token_accuracy": 0.8388357758522034, "num_tokens": 80730510.0, "step": 2111 }, { "epoch": 0.26866810838315736, "ewc_loss": 0.034423828125, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.352046966552734, "learning_rate": 8.948707079270878e-07, "loss": 0.4313, "mean_token_accuracy": 0.8669114112854004, "num_tokens": 80769369.0, "step": 2112 }, { "epoch": 0.2687953186617479, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6702880859375e-05, "grad_norm": 21.699298858642578, "learning_rate": 8.952946163628656e-07, "loss": 0.4875, "mean_token_accuracy": 0.847809374332428, "num_tokens": 80802168.0, "step": 2113 }, { "epoch": 0.26892252894033836, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.338886260986328, "learning_rate": 8.957185247986434e-07, "loss": 0.4366, "mean_token_accuracy": 0.8664987683296204, "num_tokens": 80839389.0, "step": 2114 }, { "epoch": 0.2690497392189289, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.37265396118164, "learning_rate": 8.961424332344213e-07, "loss": 0.4918, "mean_token_accuracy": 0.8514775037765503, "num_tokens": 80881139.0, "step": 2115 }, { "epoch": 0.2691769494975194, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 21.5196590423584, "learning_rate": 8.965663416701992e-07, "loss": 0.4952, "mean_token_accuracy": 0.8472856283187866, "num_tokens": 80919809.0, "step": 2116 }, { "epoch": 0.2693041597761099, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.46454429626465, "learning_rate": 8.969902501059771e-07, "loss": 0.47, "mean_token_accuracy": 0.8584213852882385, "num_tokens": 80956860.0, "step": 2117 }, { "epoch": 0.2694313700547004, "ewc_loss": 0.034423828125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.888723373413086, "learning_rate": 8.97414158541755e-07, "loss": 0.437, "mean_token_accuracy": 0.8663380146026611, "num_tokens": 80994322.0, "step": 2118 }, { "epoch": 0.26955858033329094, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6702880859375e-05, "grad_norm": 21.302734375, "learning_rate": 8.978380669775328e-07, "loss": 0.5209, "mean_token_accuracy": 0.8397312760353088, "num_tokens": 81032792.0, "step": 2119 }, { "epoch": 0.2696857906118814, "ewc_loss": 0.0341796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6226043701171875e-05, "grad_norm": 21.925643920898438, "learning_rate": 8.982619754133107e-07, "loss": 0.5223, "mean_token_accuracy": 0.8403955698013306, "num_tokens": 81070011.0, "step": 2120 }, { "epoch": 0.26981300089047194, "ewc_loss": 0.034423828125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.911680221557617, "learning_rate": 8.986858838490886e-07, "loss": 0.5231, "mean_token_accuracy": 0.8406364917755127, "num_tokens": 81113145.0, "step": 2121 }, { "epoch": 0.26994021116906247, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.822826385498047, "learning_rate": 8.991097922848663e-07, "loss": 0.4931, "mean_token_accuracy": 0.849281370639801, "num_tokens": 81154175.0, "step": 2122 }, { "epoch": 0.27006742144765294, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 22.081615447998047, "learning_rate": 8.995337007206443e-07, "loss": 0.5162, "mean_token_accuracy": 0.8436880707740784, "num_tokens": 81193319.0, "step": 2123 }, { "epoch": 0.2701946317262435, "ewc_loss": 0.034423828125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.41139030456543, "learning_rate": 8.999576091564221e-07, "loss": 0.4824, "mean_token_accuracy": 0.8520645499229431, "num_tokens": 81229057.0, "step": 2124 }, { "epoch": 0.270321842004834, "ewc_loss": 0.034423828125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6345252990722656e-05, "grad_norm": 21.817604064941406, "learning_rate": 9.003815175922001e-07, "loss": 0.4923, "mean_token_accuracy": 0.8488634824752808, "num_tokens": 81262380.0, "step": 2125 }, { "epoch": 0.2704490522834245, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 22.176515579223633, "learning_rate": 9.008054260279779e-07, "loss": 0.495, "mean_token_accuracy": 0.8491097688674927, "num_tokens": 81304080.0, "step": 2126 }, { "epoch": 0.270576262562015, "ewc_loss": 0.0341796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6106834411621094e-05, "grad_norm": 21.188268661499023, "learning_rate": 9.012293344637558e-07, "loss": 0.5265, "mean_token_accuracy": 0.8414080142974854, "num_tokens": 81340226.0, "step": 2127 }, { "epoch": 0.27070347284060553, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 22.22704315185547, "learning_rate": 9.016532428995337e-07, "loss": 0.4731, "mean_token_accuracy": 0.8537386059761047, "num_tokens": 81378046.0, "step": 2128 }, { "epoch": 0.27083068311919606, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.504179000854492, "learning_rate": 9.020771513353115e-07, "loss": 0.5159, "mean_token_accuracy": 0.842028021812439, "num_tokens": 81419486.0, "step": 2129 }, { "epoch": 0.27095789339778653, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6464462280273438e-05, "grad_norm": 21.83888053894043, "learning_rate": 9.025010597710894e-07, "loss": 0.5167, "mean_token_accuracy": 0.8412390947341919, "num_tokens": 81457488.0, "step": 2130 }, { "epoch": 0.27108510367637706, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 21.566715240478516, "learning_rate": 9.029249682068673e-07, "loss": 0.4852, "mean_token_accuracy": 0.8515462875366211, "num_tokens": 81494812.0, "step": 2131 }, { "epoch": 0.2712123139549676, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6702880859375e-05, "grad_norm": 22.26274299621582, "learning_rate": 9.033488766426451e-07, "loss": 0.5032, "mean_token_accuracy": 0.844444751739502, "num_tokens": 81531421.0, "step": 2132 }, { "epoch": 0.27133952423355806, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 21.425735473632812, "learning_rate": 9.037727850784231e-07, "loss": 0.4671, "mean_token_accuracy": 0.8548424243927002, "num_tokens": 81567221.0, "step": 2133 }, { "epoch": 0.2714667345121486, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 21.6075496673584, "learning_rate": 9.041966935142009e-07, "loss": 0.5005, "mean_token_accuracy": 0.8468745946884155, "num_tokens": 81605832.0, "step": 2134 }, { "epoch": 0.2715939447907391, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6702880859375e-05, "grad_norm": 21.7415714263916, "learning_rate": 9.046206019499788e-07, "loss": 0.4556, "mean_token_accuracy": 0.862076997756958, "num_tokens": 81646919.0, "step": 2135 }, { "epoch": 0.2717211550693296, "ewc_loss": 0.03466796875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.658367156982422e-05, "grad_norm": 21.725868225097656, "learning_rate": 9.050445103857567e-07, "loss": 0.5364, "mean_token_accuracy": 0.8405951857566833, "num_tokens": 81687214.0, "step": 2136 }, { "epoch": 0.2718483653479201, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 21.625904083251953, "learning_rate": 9.054684188215344e-07, "loss": 0.4405, "mean_token_accuracy": 0.8661591410636902, "num_tokens": 81721437.0, "step": 2137 }, { "epoch": 0.27197557562651065, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7060508728027344e-05, "grad_norm": 21.922225952148438, "learning_rate": 9.058923272573124e-07, "loss": 0.5139, "mean_token_accuracy": 0.8420690298080444, "num_tokens": 81756595.0, "step": 2138 }, { "epoch": 0.2721027859051011, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 21.523357391357422, "learning_rate": 9.063162356930902e-07, "loss": 0.4791, "mean_token_accuracy": 0.8547341227531433, "num_tokens": 81796198.0, "step": 2139 }, { "epoch": 0.27222999618369165, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7060508728027344e-05, "grad_norm": 21.997325897216797, "learning_rate": 9.067401441288681e-07, "loss": 0.5098, "mean_token_accuracy": 0.8467603921890259, "num_tokens": 81833002.0, "step": 2140 }, { "epoch": 0.2723572064622822, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 22.064865112304688, "learning_rate": 9.07164052564646e-07, "loss": 0.4809, "mean_token_accuracy": 0.8486191034317017, "num_tokens": 81872089.0, "step": 2141 }, { "epoch": 0.27248441674087265, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6941299438476562e-05, "grad_norm": 21.738019943237305, "learning_rate": 9.075879610004239e-07, "loss": 0.5163, "mean_token_accuracy": 0.8413487672805786, "num_tokens": 81908356.0, "step": 2142 }, { "epoch": 0.2726116270194632, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 22.000017166137695, "learning_rate": 9.080118694362017e-07, "loss": 0.5217, "mean_token_accuracy": 0.8380812406539917, "num_tokens": 81950251.0, "step": 2143 }, { "epoch": 0.2727388372980537, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6941299438476562e-05, "grad_norm": 21.74663734436035, "learning_rate": 9.084357778719796e-07, "loss": 0.5147, "mean_token_accuracy": 0.8397654294967651, "num_tokens": 81987157.0, "step": 2144 }, { "epoch": 0.2728660475766442, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7060508728027344e-05, "grad_norm": 22.129009246826172, "learning_rate": 9.088596863077574e-07, "loss": 0.4981, "mean_token_accuracy": 0.8516907691955566, "num_tokens": 82023633.0, "step": 2145 }, { "epoch": 0.2729932578552347, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 21.497968673706055, "learning_rate": 9.092835947435354e-07, "loss": 0.5022, "mean_token_accuracy": 0.8436265587806702, "num_tokens": 82061311.0, "step": 2146 }, { "epoch": 0.27312046813382523, "ewc_loss": 0.035400390625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7298927307128906e-05, "grad_norm": 21.970500946044922, "learning_rate": 9.097075031793132e-07, "loss": 0.5353, "mean_token_accuracy": 0.8374036550521851, "num_tokens": 82098370.0, "step": 2147 }, { "epoch": 0.2732476784124157, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 21.865550994873047, "learning_rate": 9.101314116150911e-07, "loss": 0.4631, "mean_token_accuracy": 0.8616479635238647, "num_tokens": 82134931.0, "step": 2148 }, { "epoch": 0.27337488869100623, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7060508728027344e-05, "grad_norm": 21.61025047302246, "learning_rate": 9.10555320050869e-07, "loss": 0.4716, "mean_token_accuracy": 0.8561167120933533, "num_tokens": 82171391.0, "step": 2149 }, { "epoch": 0.27350209896959676, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 22.316823959350586, "learning_rate": 9.109792284866469e-07, "loss": 0.4897, "mean_token_accuracy": 0.8532582521438599, "num_tokens": 82214064.0, "step": 2150 }, { "epoch": 0.27362930924818724, "ewc_loss": 0.034912109375, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.682209014892578e-05, "grad_norm": 21.67273712158203, "learning_rate": 9.114031369224247e-07, "loss": 0.5075, "mean_token_accuracy": 0.8457211852073669, "num_tokens": 82252774.0, "step": 2151 }, { "epoch": 0.27375651952677776, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6941299438476562e-05, "grad_norm": 21.961841583251953, "learning_rate": 9.118270453582026e-07, "loss": 0.5619, "mean_token_accuracy": 0.8320831060409546, "num_tokens": 82291645.0, "step": 2152 }, { "epoch": 0.2738837298053683, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7060508728027344e-05, "grad_norm": 21.708253860473633, "learning_rate": 9.122509537939804e-07, "loss": 0.4928, "mean_token_accuracy": 0.8445042371749878, "num_tokens": 82333643.0, "step": 2153 }, { "epoch": 0.27401094008395877, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.6941299438476562e-05, "grad_norm": 21.65192413330078, "learning_rate": 9.126748622297584e-07, "loss": 0.5323, "mean_token_accuracy": 0.8346842527389526, "num_tokens": 82370792.0, "step": 2154 }, { "epoch": 0.2741381503625493, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7179718017578125e-05, "grad_norm": 21.807069778442383, "learning_rate": 9.130987706655362e-07, "loss": 0.5239, "mean_token_accuracy": 0.8392886519432068, "num_tokens": 82408626.0, "step": 2155 }, { "epoch": 0.2742653606411398, "ewc_loss": 0.03515625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7060508728027344e-05, "grad_norm": 21.786876678466797, "learning_rate": 9.135226791013141e-07, "loss": 0.5421, "mean_token_accuracy": 0.839821994304657, "num_tokens": 82448401.0, "step": 2156 }, { "epoch": 0.2743925709197303, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.753734588623047e-05, "grad_norm": 21.797883987426758, "learning_rate": 9.13946587537092e-07, "loss": 0.4995, "mean_token_accuracy": 0.8437026739120483, "num_tokens": 82476644.0, "step": 2157 }, { "epoch": 0.2745197811983208, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.753734588623047e-05, "grad_norm": 21.74897575378418, "learning_rate": 9.143704959728699e-07, "loss": 0.5161, "mean_token_accuracy": 0.8375144600868225, "num_tokens": 82512722.0, "step": 2158 }, { "epoch": 0.27464699147691135, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7418136596679688e-05, "grad_norm": 21.89501190185547, "learning_rate": 9.147944044086476e-07, "loss": 0.5371, "mean_token_accuracy": 0.8360134363174438, "num_tokens": 82545393.0, "step": 2159 }, { "epoch": 0.2747742017555018, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.81295394897461, "learning_rate": 9.152183128444255e-07, "loss": 0.5331, "mean_token_accuracy": 0.8428146839141846, "num_tokens": 82586329.0, "step": 2160 }, { "epoch": 0.27490141203409235, "ewc_loss": 0.035400390625, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7298927307128906e-05, "grad_norm": 21.614959716796875, "learning_rate": 9.156422212802034e-07, "loss": 0.5397, "mean_token_accuracy": 0.8329208493232727, "num_tokens": 82625563.0, "step": 2161 }, { "epoch": 0.2750286223126829, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.751949310302734, "learning_rate": 9.160661297159813e-07, "loss": 0.4778, "mean_token_accuracy": 0.853646993637085, "num_tokens": 82660030.0, "step": 2162 }, { "epoch": 0.27515583259127335, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.765655517578125e-05, "grad_norm": 21.655494689941406, "learning_rate": 9.164900381517592e-07, "loss": 0.5006, "mean_token_accuracy": 0.8459357023239136, "num_tokens": 82698794.0, "step": 2163 }, { "epoch": 0.2752830428698639, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.820514678955078, "learning_rate": 9.16913946587537e-07, "loss": 0.4969, "mean_token_accuracy": 0.8465269804000854, "num_tokens": 82740015.0, "step": 2164 }, { "epoch": 0.2754102531484544, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.753734588623047e-05, "grad_norm": 22.02783966064453, "learning_rate": 9.17337855023315e-07, "loss": 0.5118, "mean_token_accuracy": 0.8424608707427979, "num_tokens": 82775765.0, "step": 2165 }, { "epoch": 0.2755374634270449, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.753734588623047e-05, "grad_norm": 21.74286651611328, "learning_rate": 9.177617634590928e-07, "loss": 0.4609, "mean_token_accuracy": 0.8590861558914185, "num_tokens": 82814324.0, "step": 2166 }, { "epoch": 0.2756646737056354, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.887859344482422, "learning_rate": 9.181856718948706e-07, "loss": 0.4488, "mean_token_accuracy": 0.8608567714691162, "num_tokens": 82851007.0, "step": 2167 }, { "epoch": 0.27579188398422594, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.810667037963867, "learning_rate": 9.186095803306485e-07, "loss": 0.4928, "mean_token_accuracy": 0.8473718762397766, "num_tokens": 82886846.0, "step": 2168 }, { "epoch": 0.2759190942628164, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.753734588623047e-05, "grad_norm": 21.817672729492188, "learning_rate": 9.190334887664264e-07, "loss": 0.5309, "mean_token_accuracy": 0.8387360572814941, "num_tokens": 82929607.0, "step": 2169 }, { "epoch": 0.27604630454140694, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.92779541015625, "learning_rate": 9.194573972022043e-07, "loss": 0.591, "mean_token_accuracy": 0.8268862962722778, "num_tokens": 82965089.0, "step": 2170 }, { "epoch": 0.27617351481999747, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.645565032958984, "learning_rate": 9.198813056379822e-07, "loss": 0.5634, "mean_token_accuracy": 0.8261318802833557, "num_tokens": 83008345.0, "step": 2171 }, { "epoch": 0.27630072509858794, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 22.415624618530273, "learning_rate": 9.2030521407376e-07, "loss": 0.5171, "mean_token_accuracy": 0.8402304649353027, "num_tokens": 83043763.0, "step": 2172 }, { "epoch": 0.27642793537717847, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 21.68208885192871, "learning_rate": 9.20729122509538e-07, "loss": 0.4617, "mean_token_accuracy": 0.8592925071716309, "num_tokens": 83081389.0, "step": 2173 }, { "epoch": 0.276555145655769, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 22.161479949951172, "learning_rate": 9.211530309453158e-07, "loss": 0.4997, "mean_token_accuracy": 0.8463259339332581, "num_tokens": 83120547.0, "step": 2174 }, { "epoch": 0.27668235593435947, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 22.141626358032227, "learning_rate": 9.215769393810936e-07, "loss": 0.5071, "mean_token_accuracy": 0.8476957082748413, "num_tokens": 83157229.0, "step": 2175 }, { "epoch": 0.27680956621295, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 2.765655517578125e-05, "grad_norm": 21.76759910583496, "learning_rate": 9.220008478168715e-07, "loss": 0.5859, "mean_token_accuracy": 0.8210735321044922, "num_tokens": 83198217.0, "step": 2176 }, { "epoch": 0.2769367764915405, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 22.159086227416992, "learning_rate": 9.224247562526494e-07, "loss": 0.5068, "mean_token_accuracy": 0.8432401418685913, "num_tokens": 83238750.0, "step": 2177 }, { "epoch": 0.277063986770131, "ewc_loss": 0.03564453125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.7418136596679688e-05, "grad_norm": 21.78731918334961, "learning_rate": 9.228486646884273e-07, "loss": 0.4926, "mean_token_accuracy": 0.847983181476593, "num_tokens": 83280138.0, "step": 2178 }, { "epoch": 0.2771911970487215, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.940715789794922, "learning_rate": 9.232725731242052e-07, "loss": 0.5492, "mean_token_accuracy": 0.8319866061210632, "num_tokens": 83320633.0, "step": 2179 }, { "epoch": 0.27731840732731206, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 21.92629051208496, "learning_rate": 9.23696481559983e-07, "loss": 0.5294, "mean_token_accuracy": 0.8362780809402466, "num_tokens": 83358787.0, "step": 2180 }, { "epoch": 0.2774456176059026, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.765655517578125e-05, "grad_norm": 22.332889556884766, "learning_rate": 9.24120389995761e-07, "loss": 0.4726, "mean_token_accuracy": 0.854509711265564, "num_tokens": 83400609.0, "step": 2181 }, { "epoch": 0.27757282788449306, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 22.09966278076172, "learning_rate": 9.245442984315387e-07, "loss": 0.4888, "mean_token_accuracy": 0.8493648767471313, "num_tokens": 83436058.0, "step": 2182 }, { "epoch": 0.2777000381630836, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 21.948835372924805, "learning_rate": 9.249682068673165e-07, "loss": 0.5243, "mean_token_accuracy": 0.8378751277923584, "num_tokens": 83474542.0, "step": 2183 }, { "epoch": 0.2778272484416741, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 21.851272583007812, "learning_rate": 9.253921153030945e-07, "loss": 0.5628, "mean_token_accuracy": 0.8285149931907654, "num_tokens": 83515551.0, "step": 2184 }, { "epoch": 0.2779544587202646, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 22.286602020263672, "learning_rate": 9.258160237388723e-07, "loss": 0.4747, "mean_token_accuracy": 0.8570295572280884, "num_tokens": 83551826.0, "step": 2185 }, { "epoch": 0.2780816689988551, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8252601623535156e-05, "grad_norm": 22.0364990234375, "learning_rate": 9.262399321746503e-07, "loss": 0.4948, "mean_token_accuracy": 0.8492608070373535, "num_tokens": 83586598.0, "step": 2186 }, { "epoch": 0.27820887927744564, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 22.064485549926758, "learning_rate": 9.266638406104281e-07, "loss": 0.5438, "mean_token_accuracy": 0.833527147769928, "num_tokens": 83625996.0, "step": 2187 }, { "epoch": 0.2783360895560361, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8371810913085938e-05, "grad_norm": 22.367691040039062, "learning_rate": 9.27087749046206e-07, "loss": 0.5126, "mean_token_accuracy": 0.8445377945899963, "num_tokens": 83657322.0, "step": 2188 }, { "epoch": 0.27846329983462664, "ewc_loss": 0.035888671875, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.777576446533203e-05, "grad_norm": 22.05022430419922, "learning_rate": 9.275116574819839e-07, "loss": 0.507, "mean_token_accuracy": 0.8440243601799011, "num_tokens": 83693578.0, "step": 2189 }, { "epoch": 0.27859051011321717, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.1328125, "learning_rate": 9.279355659177617e-07, "loss": 0.512, "mean_token_accuracy": 0.8450933694839478, "num_tokens": 83727672.0, "step": 2190 }, { "epoch": 0.27871772039180764, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.575462341308594, "learning_rate": 9.283594743535395e-07, "loss": 0.556, "mean_token_accuracy": 0.833777666091919, "num_tokens": 83761405.0, "step": 2191 }, { "epoch": 0.2788449306703982, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 21.738317489624023, "learning_rate": 9.287833827893175e-07, "loss": 0.4945, "mean_token_accuracy": 0.8451478481292725, "num_tokens": 83801564.0, "step": 2192 }, { "epoch": 0.2789721409489887, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.30355453491211, "learning_rate": 9.292072912250953e-07, "loss": 0.5242, "mean_token_accuracy": 0.839172899723053, "num_tokens": 83834260.0, "step": 2193 }, { "epoch": 0.2790993512275792, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 21.81838035583496, "learning_rate": 9.296311996608733e-07, "loss": 0.4516, "mean_token_accuracy": 0.8600172996520996, "num_tokens": 83873518.0, "step": 2194 }, { "epoch": 0.2792265615061697, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 22.393844604492188, "learning_rate": 9.300551080966511e-07, "loss": 0.5663, "mean_token_accuracy": 0.8257932662963867, "num_tokens": 83913902.0, "step": 2195 }, { "epoch": 0.27935377178476023, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.7894973754882812e-05, "grad_norm": 22.009925842285156, "learning_rate": 9.30479016532429e-07, "loss": 0.4933, "mean_token_accuracy": 0.845475435256958, "num_tokens": 83947952.0, "step": 2196 }, { "epoch": 0.2794809820633507, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.042261123657227, "learning_rate": 9.309029249682068e-07, "loss": 0.5031, "mean_token_accuracy": 0.8466137647628784, "num_tokens": 83986057.0, "step": 2197 }, { "epoch": 0.27960819234194123, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 22.18704605102539, "learning_rate": 9.313268334039847e-07, "loss": 0.5129, "mean_token_accuracy": 0.8430169820785522, "num_tokens": 84023881.0, "step": 2198 }, { "epoch": 0.27973540262053176, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.111675262451172, "learning_rate": 9.317507418397625e-07, "loss": 0.4451, "mean_token_accuracy": 0.8653066158294678, "num_tokens": 84061480.0, "step": 2199 }, { "epoch": 0.27986261289912223, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.073108673095703, "learning_rate": 9.321746502755404e-07, "loss": 0.5493, "mean_token_accuracy": 0.8338133096694946, "num_tokens": 84101782.0, "step": 2200 }, { "epoch": 0.27998982317771276, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.27412223815918, "learning_rate": 9.325985587113183e-07, "loss": 0.5063, "mean_token_accuracy": 0.8450615406036377, "num_tokens": 84136125.0, "step": 2201 }, { "epoch": 0.2801170334563033, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8371810913085938e-05, "grad_norm": 22.199182510375977, "learning_rate": 9.330224671470962e-07, "loss": 0.5211, "mean_token_accuracy": 0.8460530638694763, "num_tokens": 84169748.0, "step": 2202 }, { "epoch": 0.28024424373489376, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.371885299682617, "learning_rate": 9.334463755828741e-07, "loss": 0.5361, "mean_token_accuracy": 0.8334968686103821, "num_tokens": 84201292.0, "step": 2203 }, { "epoch": 0.2803714540134843, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8014183044433594e-05, "grad_norm": 22.22212028503418, "learning_rate": 9.338702840186519e-07, "loss": 0.536, "mean_token_accuracy": 0.8337010145187378, "num_tokens": 84240652.0, "step": 2204 }, { "epoch": 0.2804986642920748, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8252601623535156e-05, "grad_norm": 22.162614822387695, "learning_rate": 9.342941924544298e-07, "loss": 0.5281, "mean_token_accuracy": 0.8398711085319519, "num_tokens": 84280252.0, "step": 2205 }, { "epoch": 0.2806258745706653, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.368410110473633, "learning_rate": 9.347181008902076e-07, "loss": 0.4647, "mean_token_accuracy": 0.8605486154556274, "num_tokens": 84316505.0, "step": 2206 }, { "epoch": 0.2807530848492558, "ewc_loss": 0.0361328125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.09790802001953, "learning_rate": 9.351420093259855e-07, "loss": 0.468, "mean_token_accuracy": 0.8560723066329956, "num_tokens": 84349990.0, "step": 2207 }, { "epoch": 0.28088029512784635, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 2.8252601623535156e-05, "grad_norm": 22.51569938659668, "learning_rate": 9.355659177617634e-07, "loss": 0.4573, "mean_token_accuracy": 0.8573896884918213, "num_tokens": 84381066.0, "step": 2208 }, { "epoch": 0.2810075054064368, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 21.80353355407715, "learning_rate": 9.359898261975413e-07, "loss": 0.4481, "mean_token_accuracy": 0.8625527024269104, "num_tokens": 84422540.0, "step": 2209 }, { "epoch": 0.28113471568502735, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8133392333984375e-05, "grad_norm": 22.25611686706543, "learning_rate": 9.364137346333192e-07, "loss": 0.5765, "mean_token_accuracy": 0.8251758813858032, "num_tokens": 84462158.0, "step": 2210 }, { "epoch": 0.2812619259636179, "ewc_loss": 0.03662109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8371810913085938e-05, "grad_norm": 22.33687973022461, "learning_rate": 9.368376430690971e-07, "loss": 0.4664, "mean_token_accuracy": 0.8559547066688538, "num_tokens": 84495887.0, "step": 2211 }, { "epoch": 0.28138913624220835, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8252601623535156e-05, "grad_norm": 22.060733795166016, "learning_rate": 9.372615515048749e-07, "loss": 0.5399, "mean_token_accuracy": 0.8357607126235962, "num_tokens": 84530176.0, "step": 2212 }, { "epoch": 0.2815163465207989, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8252601623535156e-05, "grad_norm": 22.255483627319336, "learning_rate": 9.376854599406528e-07, "loss": 0.54, "mean_token_accuracy": 0.8374311923980713, "num_tokens": 84574450.0, "step": 2213 }, { "epoch": 0.2816435567993894, "ewc_loss": 0.03662109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.849102020263672e-05, "grad_norm": 22.126344680786133, "learning_rate": 9.381093683764306e-07, "loss": 0.5087, "mean_token_accuracy": 0.8457965850830078, "num_tokens": 84610661.0, "step": 2214 }, { "epoch": 0.2817707670779799, "ewc_loss": 0.03662109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8371810913085938e-05, "grad_norm": 21.997316360473633, "learning_rate": 9.385332768122085e-07, "loss": 0.49, "mean_token_accuracy": 0.8486748337745667, "num_tokens": 84651259.0, "step": 2215 }, { "epoch": 0.2818979773565704, "ewc_loss": 0.03662109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8371810913085938e-05, "grad_norm": 22.17449951171875, "learning_rate": 9.389571852479864e-07, "loss": 0.4952, "mean_token_accuracy": 0.8491718769073486, "num_tokens": 84692389.0, "step": 2216 }, { "epoch": 0.28202518763516093, "ewc_loss": 0.036376953125, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8252601623535156e-05, "grad_norm": 22.37981605529785, "learning_rate": 9.393810936837643e-07, "loss": 0.4974, "mean_token_accuracy": 0.8481448292732239, "num_tokens": 84731905.0, "step": 2217 }, { "epoch": 0.2821523979137514, "ewc_loss": 0.03662109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.849102020263672e-05, "grad_norm": 21.909645080566406, "learning_rate": 9.398050021195422e-07, "loss": 0.5264, "mean_token_accuracy": 0.8437135219573975, "num_tokens": 84773727.0, "step": 2218 }, { "epoch": 0.28227960819234194, "ewc_loss": 0.036865234375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.86102294921875e-05, "grad_norm": 22.13725471496582, "learning_rate": 9.402289105553201e-07, "loss": 0.4711, "mean_token_accuracy": 0.8556045293807983, "num_tokens": 84816771.0, "step": 2219 }, { "epoch": 0.28240681847093246, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8967857360839844e-05, "grad_norm": 22.47526741027832, "learning_rate": 9.406528189910978e-07, "loss": 0.4967, "mean_token_accuracy": 0.8490324020385742, "num_tokens": 84853172.0, "step": 2220 }, { "epoch": 0.28253402874952294, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.872943878173828e-05, "grad_norm": 22.016530990600586, "learning_rate": 9.410767274268757e-07, "loss": 0.4782, "mean_token_accuracy": 0.8512060642242432, "num_tokens": 84891508.0, "step": 2221 }, { "epoch": 0.28266123902811346, "ewc_loss": 0.037353515625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9087066650390625e-05, "grad_norm": 22.340436935424805, "learning_rate": 9.415006358626536e-07, "loss": 0.5749, "mean_token_accuracy": 0.8243043422698975, "num_tokens": 84931204.0, "step": 2222 }, { "epoch": 0.282788449306704, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.006799697875977, "learning_rate": 9.419245442984314e-07, "loss": 0.5581, "mean_token_accuracy": 0.8307048082351685, "num_tokens": 84966782.0, "step": 2223 }, { "epoch": 0.28291565958529447, "ewc_loss": 0.037353515625, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.451995849609375, "learning_rate": 9.423484527342094e-07, "loss": 0.5025, "mean_token_accuracy": 0.8441061973571777, "num_tokens": 85001512.0, "step": 2224 }, { "epoch": 0.283042869863885, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.257305145263672, "learning_rate": 9.427723611699872e-07, "loss": 0.5107, "mean_token_accuracy": 0.8456152677536011, "num_tokens": 85036082.0, "step": 2225 }, { "epoch": 0.2831700801424755, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.156856536865234, "learning_rate": 9.431962696057652e-07, "loss": 0.5159, "mean_token_accuracy": 0.8431070446968079, "num_tokens": 85079433.0, "step": 2226 }, { "epoch": 0.283297290421066, "ewc_loss": 0.037353515625, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 2.9087066650390625e-05, "grad_norm": 22.463937759399414, "learning_rate": 9.43620178041543e-07, "loss": 0.4674, "mean_token_accuracy": 0.8592262268066406, "num_tokens": 85116940.0, "step": 2227 }, { "epoch": 0.2834245006996565, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.392492294311523, "learning_rate": 9.440440864773208e-07, "loss": 0.521, "mean_token_accuracy": 0.8423007726669312, "num_tokens": 85153328.0, "step": 2228 }, { "epoch": 0.28355171097824705, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.872943878173828e-05, "grad_norm": 22.49920082092285, "learning_rate": 9.444679949130987e-07, "loss": 0.5221, "mean_token_accuracy": 0.8388224840164185, "num_tokens": 85194059.0, "step": 2229 }, { "epoch": 0.2836789212568376, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.872943878173828e-05, "grad_norm": 22.589820861816406, "learning_rate": 9.448919033488766e-07, "loss": 0.5443, "mean_token_accuracy": 0.8371624946594238, "num_tokens": 85230820.0, "step": 2230 }, { "epoch": 0.28380613153542805, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.872943878173828e-05, "grad_norm": 22.59039306640625, "learning_rate": 9.453158117846544e-07, "loss": 0.5244, "mean_token_accuracy": 0.841377854347229, "num_tokens": 85268864.0, "step": 2231 }, { "epoch": 0.2839333418140186, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8967857360839844e-05, "grad_norm": 22.478317260742188, "learning_rate": 9.457397202204324e-07, "loss": 0.5534, "mean_token_accuracy": 0.8322913646697998, "num_tokens": 85311377.0, "step": 2232 }, { "epoch": 0.2840605520926091, "ewc_loss": 0.036865234375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.86102294921875e-05, "grad_norm": 22.1800537109375, "learning_rate": 9.461636286562102e-07, "loss": 0.4642, "mean_token_accuracy": 0.8607311248779297, "num_tokens": 85347082.0, "step": 2233 }, { "epoch": 0.2841877623711996, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.435827255249023, "learning_rate": 9.465875370919882e-07, "loss": 0.5557, "mean_token_accuracy": 0.83371502161026, "num_tokens": 85384696.0, "step": 2234 }, { "epoch": 0.2843149726497901, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.40273094177246, "learning_rate": 9.470114455277659e-07, "loss": 0.5406, "mean_token_accuracy": 0.8350785970687866, "num_tokens": 85420192.0, "step": 2235 }, { "epoch": 0.28444218292838064, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.36131477355957, "learning_rate": 9.474353539635438e-07, "loss": 0.4892, "mean_token_accuracy": 0.8489115238189697, "num_tokens": 85462281.0, "step": 2236 }, { "epoch": 0.2845693932069711, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8967857360839844e-05, "grad_norm": 22.848121643066406, "learning_rate": 9.478592623993217e-07, "loss": 0.4962, "mean_token_accuracy": 0.8473256826400757, "num_tokens": 85495191.0, "step": 2237 }, { "epoch": 0.28469660348556164, "ewc_loss": 0.036865234375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.86102294921875e-05, "grad_norm": 22.468624114990234, "learning_rate": 9.482831708350996e-07, "loss": 0.4926, "mean_token_accuracy": 0.8508449792861938, "num_tokens": 85533803.0, "step": 2238 }, { "epoch": 0.28482381376415217, "ewc_loss": 0.037353515625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9087066650390625e-05, "grad_norm": 22.295806884765625, "learning_rate": 9.487070792708775e-07, "loss": 0.5411, "mean_token_accuracy": 0.8374172449111938, "num_tokens": 85573424.0, "step": 2239 }, { "epoch": 0.28495102404274264, "ewc_loss": 0.037109375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.8848648071289062e-05, "grad_norm": 22.46714210510254, "learning_rate": 9.491309877066554e-07, "loss": 0.5238, "mean_token_accuracy": 0.8407292366027832, "num_tokens": 85616423.0, "step": 2240 }, { "epoch": 0.28507823432133317, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.20584487915039, "learning_rate": 9.495548961424332e-07, "loss": 0.5509, "mean_token_accuracy": 0.8319470882415771, "num_tokens": 85658410.0, "step": 2241 }, { "epoch": 0.2852054445999237, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.428089141845703, "learning_rate": 9.499788045782111e-07, "loss": 0.5083, "mean_token_accuracy": 0.8445999622344971, "num_tokens": 85695767.0, "step": 2242 }, { "epoch": 0.28533265487851417, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.4344539642334, "learning_rate": 9.504027130139889e-07, "loss": 0.5242, "mean_token_accuracy": 0.8403929471969604, "num_tokens": 85737771.0, "step": 2243 }, { "epoch": 0.2854598651571047, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.944469451904297e-05, "grad_norm": 22.039329528808594, "learning_rate": 9.508266214497667e-07, "loss": 0.4697, "mean_token_accuracy": 0.859146773815155, "num_tokens": 85774789.0, "step": 2244 }, { "epoch": 0.2855870754356952, "ewc_loss": 0.037353515625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9087066650390625e-05, "grad_norm": 22.904327392578125, "learning_rate": 9.512505298855447e-07, "loss": 0.5084, "mean_token_accuracy": 0.8477506637573242, "num_tokens": 85817833.0, "step": 2245 }, { "epoch": 0.2857142857142857, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.161685943603516, "learning_rate": 9.516744383213225e-07, "loss": 0.4527, "mean_token_accuracy": 0.8608886003494263, "num_tokens": 85854488.0, "step": 2246 }, { "epoch": 0.2858414959928762, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.944469451904297e-05, "grad_norm": 22.556243896484375, "learning_rate": 9.520983467571005e-07, "loss": 0.5308, "mean_token_accuracy": 0.8384844064712524, "num_tokens": 85893746.0, "step": 2247 }, { "epoch": 0.28596870627146675, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9325485229492188e-05, "grad_norm": 22.445810317993164, "learning_rate": 9.525222551928783e-07, "loss": 0.5738, "mean_token_accuracy": 0.8323557376861572, "num_tokens": 85932798.0, "step": 2248 }, { "epoch": 0.2860959165500572, "ewc_loss": 0.0380859375, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.968311309814453e-05, "grad_norm": 22.625444412231445, "learning_rate": 9.529461636286562e-07, "loss": 0.4857, "mean_token_accuracy": 0.8509141206741333, "num_tokens": 85964446.0, "step": 2249 }, { "epoch": 0.28622312682864776, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9325485229492188e-05, "grad_norm": 22.34233856201172, "learning_rate": 9.533700720644341e-07, "loss": 0.462, "mean_token_accuracy": 0.8593546748161316, "num_tokens": 86000587.0, "step": 2250 }, { "epoch": 0.2863503371072383, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 2.9325485229492188e-05, "grad_norm": 22.751873016357422, "learning_rate": 9.537939805002118e-07, "loss": 0.5148, "mean_token_accuracy": 0.8433433771133423, "num_tokens": 86042891.0, "step": 2251 }, { "epoch": 0.28647754738582876, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.447547912597656, "learning_rate": 9.542178889359898e-07, "loss": 0.4907, "mean_token_accuracy": 0.8508255481719971, "num_tokens": 86079973.0, "step": 2252 }, { "epoch": 0.2866047576644193, "ewc_loss": 0.037841796875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.944469451904297e-05, "grad_norm": 22.624465942382812, "learning_rate": 9.546417973717677e-07, "loss": 0.5148, "mean_token_accuracy": 0.8442997932434082, "num_tokens": 86119767.0, "step": 2253 }, { "epoch": 0.2867319679430098, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9325485229492188e-05, "grad_norm": 22.5270938873291, "learning_rate": 9.550657058075455e-07, "loss": 0.5359, "mean_token_accuracy": 0.8385356664657593, "num_tokens": 86163176.0, "step": 2254 }, { "epoch": 0.2868591782216003, "ewc_loss": 0.037841796875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.944469451904297e-05, "grad_norm": 22.715728759765625, "learning_rate": 9.554896142433234e-07, "loss": 0.5322, "mean_token_accuracy": 0.836340069770813, "num_tokens": 86201523.0, "step": 2255 }, { "epoch": 0.2869863885001908, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9206275939941406e-05, "grad_norm": 22.399961471557617, "learning_rate": 9.559135226791012e-07, "loss": 0.549, "mean_token_accuracy": 0.8290702104568481, "num_tokens": 86240277.0, "step": 2256 }, { "epoch": 0.28711359877878134, "ewc_loss": 0.03759765625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9325485229492188e-05, "grad_norm": 22.311691284179688, "learning_rate": 9.563374311148793e-07, "loss": 0.5393, "mean_token_accuracy": 0.8369151949882507, "num_tokens": 86289714.0, "step": 2257 }, { "epoch": 0.2872408090573718, "ewc_loss": 0.0380859375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.968311309814453e-05, "grad_norm": 22.646093368530273, "learning_rate": 9.56761339550657e-07, "loss": 0.49, "mean_token_accuracy": 0.8524388074874878, "num_tokens": 86320786.0, "step": 2258 }, { "epoch": 0.28736801933596234, "ewc_loss": 0.0380859375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.968311309814453e-05, "grad_norm": 22.505075454711914, "learning_rate": 9.57185247986435e-07, "loss": 0.481, "mean_token_accuracy": 0.8535352945327759, "num_tokens": 86358002.0, "step": 2259 }, { "epoch": 0.28749522961455287, "ewc_loss": 0.0380859375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9802322387695312e-05, "grad_norm": 22.2388858795166, "learning_rate": 9.576091564222128e-07, "loss": 0.4492, "mean_token_accuracy": 0.8620883226394653, "num_tokens": 86395334.0, "step": 2260 }, { "epoch": 0.28762243989314334, "ewc_loss": 0.0380859375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.968311309814453e-05, "grad_norm": 22.450550079345703, "learning_rate": 9.580330648579906e-07, "loss": 0.5263, "mean_token_accuracy": 0.8365120887756348, "num_tokens": 86432616.0, "step": 2261 }, { "epoch": 0.2877496501717339, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0159950256347656e-05, "grad_norm": 22.675188064575195, "learning_rate": 9.584569732937685e-07, "loss": 0.5201, "mean_token_accuracy": 0.8399651646614075, "num_tokens": 86472293.0, "step": 2262 }, { "epoch": 0.2878768604503244, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9921531677246094e-05, "grad_norm": 22.243070602416992, "learning_rate": 9.588808817295463e-07, "loss": 0.5136, "mean_token_accuracy": 0.8399814367294312, "num_tokens": 86506304.0, "step": 2263 }, { "epoch": 0.2880040707289149, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0279159545898438e-05, "grad_norm": 22.79641342163086, "learning_rate": 9.593047901653242e-07, "loss": 0.5098, "mean_token_accuracy": 0.843640923500061, "num_tokens": 86540520.0, "step": 2264 }, { "epoch": 0.2881312810075054, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9921531677246094e-05, "grad_norm": 22.255041122436523, "learning_rate": 9.597286986011022e-07, "loss": 0.4702, "mean_token_accuracy": 0.8555666208267212, "num_tokens": 86576956.0, "step": 2265 }, { "epoch": 0.28825849128609593, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0040740966796875e-05, "grad_norm": 22.890262603759766, "learning_rate": 9.601526070368799e-07, "loss": 0.5557, "mean_token_accuracy": 0.8313126564025879, "num_tokens": 86613799.0, "step": 2266 }, { "epoch": 0.2883857015646864, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0040740966796875e-05, "grad_norm": 22.423229217529297, "learning_rate": 9.60576515472658e-07, "loss": 0.4919, "mean_token_accuracy": 0.8503068685531616, "num_tokens": 86650338.0, "step": 2267 }, { "epoch": 0.28851291184327693, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0159950256347656e-05, "grad_norm": 22.703725814819336, "learning_rate": 9.610004239084358e-07, "loss": 0.507, "mean_token_accuracy": 0.8440973162651062, "num_tokens": 86691605.0, "step": 2268 }, { "epoch": 0.28864012212186746, "ewc_loss": 0.0380859375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.968311309814453e-05, "grad_norm": 22.62053871154785, "learning_rate": 9.614243323442136e-07, "loss": 0.574, "mean_token_accuracy": 0.8226651549339294, "num_tokens": 86728767.0, "step": 2269 }, { "epoch": 0.28876733240045793, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0159950256347656e-05, "grad_norm": 22.492795944213867, "learning_rate": 9.618482407799915e-07, "loss": 0.4879, "mean_token_accuracy": 0.8493164777755737, "num_tokens": 86765240.0, "step": 2270 }, { "epoch": 0.28889454267904846, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9921531677246094e-05, "grad_norm": 22.57959747314453, "learning_rate": 9.622721492157693e-07, "loss": 0.4541, "mean_token_accuracy": 0.8623318076133728, "num_tokens": 86804042.0, "step": 2271 }, { "epoch": 0.289021752957639, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9921531677246094e-05, "grad_norm": 22.677082061767578, "learning_rate": 9.626960576515472e-07, "loss": 0.5069, "mean_token_accuracy": 0.8442193269729614, "num_tokens": 86835220.0, "step": 2272 }, { "epoch": 0.28914896323622946, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.039836883544922e-05, "grad_norm": 22.721240997314453, "learning_rate": 9.63119966087325e-07, "loss": 0.4917, "mean_token_accuracy": 0.8481829762458801, "num_tokens": 86873414.0, "step": 2273 }, { "epoch": 0.28927617351482, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 2.9921531677246094e-05, "grad_norm": 22.520925521850586, "learning_rate": 9.635438745231029e-07, "loss": 0.4801, "mean_token_accuracy": 0.8540780544281006, "num_tokens": 86914272.0, "step": 2274 }, { "epoch": 0.2894033837934105, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.039836883544922e-05, "grad_norm": 22.65674591064453, "learning_rate": 9.63967782958881e-07, "loss": 0.5236, "mean_token_accuracy": 0.8378487229347229, "num_tokens": 86952425.0, "step": 2275 }, { "epoch": 0.289530594072001, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.039836883544922e-05, "grad_norm": 22.54839515686035, "learning_rate": 9.643916913946588e-07, "loss": 0.482, "mean_token_accuracy": 0.8549755215644836, "num_tokens": 86987842.0, "step": 2276 }, { "epoch": 0.2896578043505915, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0279159545898438e-05, "grad_norm": 22.86729621887207, "learning_rate": 9.648155998304366e-07, "loss": 0.4534, "mean_token_accuracy": 0.8625339269638062, "num_tokens": 87026750.0, "step": 2277 }, { "epoch": 0.28978501462918205, "ewc_loss": 0.038330078125, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0040740966796875e-05, "grad_norm": 22.65812110900879, "learning_rate": 9.652395082662145e-07, "loss": 0.5579, "mean_token_accuracy": 0.8286803960800171, "num_tokens": 87069685.0, "step": 2278 }, { "epoch": 0.2899122249077726, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.496139526367188, "learning_rate": 9.656634167019923e-07, "loss": 0.5146, "mean_token_accuracy": 0.8393505811691284, "num_tokens": 87107054.0, "step": 2279 }, { "epoch": 0.29003943518636305, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0517578125e-05, "grad_norm": 22.888540267944336, "learning_rate": 9.660873251377701e-07, "loss": 0.5204, "mean_token_accuracy": 0.8401393890380859, "num_tokens": 87146254.0, "step": 2280 }, { "epoch": 0.2901666454649536, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0159950256347656e-05, "grad_norm": 22.432987213134766, "learning_rate": 9.66511233573548e-07, "loss": 0.546, "mean_token_accuracy": 0.8376563191413879, "num_tokens": 87182404.0, "step": 2281 }, { "epoch": 0.2902938557435441, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.690139770507812, "learning_rate": 9.669351420093258e-07, "loss": 0.4765, "mean_token_accuracy": 0.8546922206878662, "num_tokens": 87216551.0, "step": 2282 }, { "epoch": 0.2904210660221346, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0517578125e-05, "grad_norm": 22.648103713989258, "learning_rate": 9.67359050445104e-07, "loss": 0.5035, "mean_token_accuracy": 0.8467118144035339, "num_tokens": 87249402.0, "step": 2283 }, { "epoch": 0.2905482763007251, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0279159545898438e-05, "grad_norm": 22.338481903076172, "learning_rate": 9.677829588808817e-07, "loss": 0.4878, "mean_token_accuracy": 0.8481203317642212, "num_tokens": 87281846.0, "step": 2284 }, { "epoch": 0.29067548657931563, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0517578125e-05, "grad_norm": 22.428678512573242, "learning_rate": 9.682068673166596e-07, "loss": 0.5604, "mean_token_accuracy": 0.833686113357544, "num_tokens": 87315765.0, "step": 2285 }, { "epoch": 0.2908026968579061, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.55902671813965, "learning_rate": 9.686307757524374e-07, "loss": 0.5009, "mean_token_accuracy": 0.8458212614059448, "num_tokens": 87353668.0, "step": 2286 }, { "epoch": 0.29092990713649663, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.420299530029297, "learning_rate": 9.690546841882153e-07, "loss": 0.4313, "mean_token_accuracy": 0.8676536083221436, "num_tokens": 87390747.0, "step": 2287 }, { "epoch": 0.29105711741508716, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.63963508605957, "learning_rate": 9.694785926239931e-07, "loss": 0.5147, "mean_token_accuracy": 0.8463674783706665, "num_tokens": 87430686.0, "step": 2288 }, { "epoch": 0.29118432769367764, "ewc_loss": 0.03857421875, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0279159545898438e-05, "grad_norm": 22.258808135986328, "learning_rate": 9.69902501059771e-07, "loss": 0.5553, "mean_token_accuracy": 0.830726146697998, "num_tokens": 87465290.0, "step": 2289 }, { "epoch": 0.29131153797226816, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.73785972595215, "learning_rate": 9.703264094955488e-07, "loss": 0.5241, "mean_token_accuracy": 0.8405033349990845, "num_tokens": 87504553.0, "step": 2290 }, { "epoch": 0.2914387482508587, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0517578125e-05, "grad_norm": 22.283510208129883, "learning_rate": 9.707503179313269e-07, "loss": 0.4918, "mean_token_accuracy": 0.8514312505722046, "num_tokens": 87544896.0, "step": 2291 }, { "epoch": 0.29156595852944917, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.87533187866211, "learning_rate": 9.711742263671047e-07, "loss": 0.4923, "mean_token_accuracy": 0.8504536747932434, "num_tokens": 87582075.0, "step": 2292 }, { "epoch": 0.2916931688080397, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0517578125e-05, "grad_norm": 22.232513427734375, "learning_rate": 9.715981348028826e-07, "loss": 0.4853, "mean_token_accuracy": 0.8517014980316162, "num_tokens": 87623117.0, "step": 2293 }, { "epoch": 0.2918203790866302, "ewc_loss": 0.039306640625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.853225708007812, "learning_rate": 9.720220432386604e-07, "loss": 0.5019, "mean_token_accuracy": 0.8487919569015503, "num_tokens": 87656263.0, "step": 2294 }, { "epoch": 0.2919475893652207, "ewc_loss": 0.038818359375, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.039836883544922e-05, "grad_norm": 22.310693740844727, "learning_rate": 9.724459516744383e-07, "loss": 0.4731, "mean_token_accuracy": 0.8568810820579529, "num_tokens": 87695852.0, "step": 2295 }, { "epoch": 0.2920747996438112, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.489639282226562, "learning_rate": 9.728698601102161e-07, "loss": 0.4735, "mean_token_accuracy": 0.8563942909240723, "num_tokens": 87734331.0, "step": 2296 }, { "epoch": 0.29220200992240175, "ewc_loss": 0.039306640625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.726686477661133, "learning_rate": 9.73293768545994e-07, "loss": 0.5065, "mean_token_accuracy": 0.8409911394119263, "num_tokens": 87768447.0, "step": 2297 }, { "epoch": 0.2923292202009922, "ewc_loss": 0.039306640625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.27037239074707, "learning_rate": 9.737176769817718e-07, "loss": 0.5349, "mean_token_accuracy": 0.8425627946853638, "num_tokens": 87813213.0, "step": 2298 }, { "epoch": 0.29245643047958275, "ewc_loss": 0.039306640625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.78411293029785, "learning_rate": 9.741415854175499e-07, "loss": 0.5713, "mean_token_accuracy": 0.8305132389068604, "num_tokens": 87851336.0, "step": 2299 }, { "epoch": 0.2925836407581733, "ewc_loss": 0.0390625, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.075599670410156e-05, "grad_norm": 22.22186279296875, "learning_rate": 9.745654938533277e-07, "loss": 0.5251, "mean_token_accuracy": 0.8395199775695801, "num_tokens": 87888474.0, "step": 2300 }, { "epoch": 0.29271085103676375, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.748077392578125, "learning_rate": 9.749894022891056e-07, "loss": 0.5494, "mean_token_accuracy": 0.8349564671516418, "num_tokens": 87928232.0, "step": 2301 }, { "epoch": 0.2928380613153543, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.4388370513916, "learning_rate": 9.754133107248834e-07, "loss": 0.4657, "mean_token_accuracy": 0.8570910692214966, "num_tokens": 87966982.0, "step": 2302 }, { "epoch": 0.2929652715939448, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.75072479248047, "learning_rate": 9.758372191606612e-07, "loss": 0.554, "mean_token_accuracy": 0.8308616876602173, "num_tokens": 88003550.0, "step": 2303 }, { "epoch": 0.2930924818725353, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.53420066833496, "learning_rate": 9.76261127596439e-07, "loss": 0.4633, "mean_token_accuracy": 0.8605437278747559, "num_tokens": 88038038.0, "step": 2304 }, { "epoch": 0.2932196921511258, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.58075714111328, "learning_rate": 9.76685036032217e-07, "loss": 0.5367, "mean_token_accuracy": 0.8339504599571228, "num_tokens": 88074627.0, "step": 2305 }, { "epoch": 0.29334690242971634, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.77044105529785, "learning_rate": 9.771089444679948e-07, "loss": 0.4779, "mean_token_accuracy": 0.8545122146606445, "num_tokens": 88116772.0, "step": 2306 }, { "epoch": 0.2934741127083068, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.78946876525879, "learning_rate": 9.775328529037728e-07, "loss": 0.5236, "mean_token_accuracy": 0.8390671610832214, "num_tokens": 88152888.0, "step": 2307 }, { "epoch": 0.29360132298689734, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.583087921142578, "learning_rate": 9.779567613395507e-07, "loss": 0.5538, "mean_token_accuracy": 0.8352298140525818, "num_tokens": 88184544.0, "step": 2308 }, { "epoch": 0.29372853326548787, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.454111099243164, "learning_rate": 9.783806697753285e-07, "loss": 0.4968, "mean_token_accuracy": 0.8504666090011597, "num_tokens": 88221603.0, "step": 2309 }, { "epoch": 0.29385574354407834, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.721372604370117, "learning_rate": 9.788045782111064e-07, "loss": 0.4713, "mean_token_accuracy": 0.8558316230773926, "num_tokens": 88265109.0, "step": 2310 }, { "epoch": 0.29398295382266887, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.00334930419922, "learning_rate": 9.792284866468842e-07, "loss": 0.4608, "mean_token_accuracy": 0.8586195707321167, "num_tokens": 88302756.0, "step": 2311 }, { "epoch": 0.2941101641012594, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.573486328125, "learning_rate": 9.79652395082662e-07, "loss": 0.4616, "mean_token_accuracy": 0.8545860648155212, "num_tokens": 88335605.0, "step": 2312 }, { "epoch": 0.29423737437984987, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.88934326171875, "learning_rate": 9.8007630351844e-07, "loss": 0.5308, "mean_token_accuracy": 0.8406825065612793, "num_tokens": 88369447.0, "step": 2313 }, { "epoch": 0.2943645846584404, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.878171920776367, "learning_rate": 9.805002119542178e-07, "loss": 0.5616, "mean_token_accuracy": 0.830294132232666, "num_tokens": 88404442.0, "step": 2314 }, { "epoch": 0.2944917949370309, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.884714126586914, "learning_rate": 9.809241203899958e-07, "loss": 0.5063, "mean_token_accuracy": 0.8478610515594482, "num_tokens": 88445870.0, "step": 2315 }, { "epoch": 0.2946190052156214, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.712696075439453, "learning_rate": 9.813480288257737e-07, "loss": 0.5209, "mean_token_accuracy": 0.8374295234680176, "num_tokens": 88480994.0, "step": 2316 }, { "epoch": 0.2947462154942119, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.739185333251953, "learning_rate": 9.817719372615515e-07, "loss": 0.4746, "mean_token_accuracy": 0.856113612651825, "num_tokens": 88519666.0, "step": 2317 }, { "epoch": 0.29487342577280246, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.435888290405273, "learning_rate": 9.821958456973294e-07, "loss": 0.4501, "mean_token_accuracy": 0.8612621426582336, "num_tokens": 88553855.0, "step": 2318 }, { "epoch": 0.29500063605139293, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.93784523010254, "learning_rate": 9.826197541331072e-07, "loss": 0.4659, "mean_token_accuracy": 0.8594436645507812, "num_tokens": 88589053.0, "step": 2319 }, { "epoch": 0.29512784632998346, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.91827392578125, "learning_rate": 9.83043662568885e-07, "loss": 0.5247, "mean_token_accuracy": 0.8418201208114624, "num_tokens": 88627067.0, "step": 2320 }, { "epoch": 0.295255056608574, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.865808486938477, "learning_rate": 9.83467571004663e-07, "loss": 0.4675, "mean_token_accuracy": 0.8570048809051514, "num_tokens": 88668030.0, "step": 2321 }, { "epoch": 0.29538226688716446, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.6602725982666, "learning_rate": 9.838914794404407e-07, "loss": 0.4677, "mean_token_accuracy": 0.8593511581420898, "num_tokens": 88708889.0, "step": 2322 }, { "epoch": 0.295509477165755, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.30573081970215, "learning_rate": 9.843153878762188e-07, "loss": 0.5338, "mean_token_accuracy": 0.8399317860603333, "num_tokens": 88745689.0, "step": 2323 }, { "epoch": 0.2956366874443455, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.513675689697266, "learning_rate": 9.847392963119966e-07, "loss": 0.4846, "mean_token_accuracy": 0.8526980876922607, "num_tokens": 88784667.0, "step": 2324 }, { "epoch": 0.295763897722936, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.04412269592285, "learning_rate": 9.851632047477745e-07, "loss": 0.4986, "mean_token_accuracy": 0.8471208214759827, "num_tokens": 88827334.0, "step": 2325 }, { "epoch": 0.2958911080015265, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.96282958984375, "learning_rate": 9.855871131835523e-07, "loss": 0.4317, "mean_token_accuracy": 0.8670903444290161, "num_tokens": 88863744.0, "step": 2326 }, { "epoch": 0.29601831828011704, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.700937271118164, "learning_rate": 9.860110216193302e-07, "loss": 0.4645, "mean_token_accuracy": 0.8602355718612671, "num_tokens": 88903693.0, "step": 2327 }, { "epoch": 0.2961455285587075, "ewc_loss": 0.03955078125, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.0994415283203125e-05, "grad_norm": 22.905372619628906, "learning_rate": 9.86434930055108e-07, "loss": 0.4599, "mean_token_accuracy": 0.8590079545974731, "num_tokens": 88936668.0, "step": 2328 }, { "epoch": 0.29627273883729804, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.94535255432129, "learning_rate": 9.868588384908859e-07, "loss": 0.4696, "mean_token_accuracy": 0.8556994199752808, "num_tokens": 88970920.0, "step": 2329 }, { "epoch": 0.2963999491158886, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.059213638305664, "learning_rate": 9.872827469266637e-07, "loss": 0.4719, "mean_token_accuracy": 0.8541743755340576, "num_tokens": 89010856.0, "step": 2330 }, { "epoch": 0.2965271593944791, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.028810501098633, "learning_rate": 9.877066553624418e-07, "loss": 0.4816, "mean_token_accuracy": 0.8536537885665894, "num_tokens": 89047109.0, "step": 2331 }, { "epoch": 0.2966543696730696, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.78664207458496, "learning_rate": 9.881305637982196e-07, "loss": 0.4908, "mean_token_accuracy": 0.8520548939704895, "num_tokens": 89087859.0, "step": 2332 }, { "epoch": 0.2967815799516601, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.214311599731445, "learning_rate": 9.885544722339975e-07, "loss": 0.4736, "mean_token_accuracy": 0.8538403511047363, "num_tokens": 89123601.0, "step": 2333 }, { "epoch": 0.29690879023025063, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.71080207824707, "learning_rate": 9.889783806697753e-07, "loss": 0.4545, "mean_token_accuracy": 0.8637988567352295, "num_tokens": 89160133.0, "step": 2334 }, { "epoch": 0.2970360005088411, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.27067756652832, "learning_rate": 9.894022891055532e-07, "loss": 0.4476, "mean_token_accuracy": 0.8652194142341614, "num_tokens": 89196731.0, "step": 2335 }, { "epoch": 0.29716321078743163, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.05976676940918, "learning_rate": 9.89826197541331e-07, "loss": 0.4376, "mean_token_accuracy": 0.8653079271316528, "num_tokens": 89230159.0, "step": 2336 }, { "epoch": 0.29729042106602216, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.810073852539062, "learning_rate": 9.902501059771089e-07, "loss": 0.5019, "mean_token_accuracy": 0.8459472060203552, "num_tokens": 89264648.0, "step": 2337 }, { "epoch": 0.29741763134461263, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.178464889526367, "learning_rate": 9.906740144128867e-07, "loss": 0.5041, "mean_token_accuracy": 0.8435282707214355, "num_tokens": 89302505.0, "step": 2338 }, { "epoch": 0.29754484162320316, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.89607810974121, "learning_rate": 9.910979228486648e-07, "loss": 0.4617, "mean_token_accuracy": 0.8575180768966675, "num_tokens": 89338079.0, "step": 2339 }, { "epoch": 0.2976720519017937, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.84204864501953, "learning_rate": 9.915218312844426e-07, "loss": 0.5278, "mean_token_accuracy": 0.8442310094833374, "num_tokens": 89376717.0, "step": 2340 }, { "epoch": 0.29779926218038416, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.96478843688965, "learning_rate": 9.919457397202205e-07, "loss": 0.5162, "mean_token_accuracy": 0.8419770002365112, "num_tokens": 89416164.0, "step": 2341 }, { "epoch": 0.2979264724589747, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.04722785949707, "learning_rate": 9.923696481559983e-07, "loss": 0.4671, "mean_token_accuracy": 0.8576298952102661, "num_tokens": 89446491.0, "step": 2342 }, { "epoch": 0.2980536827375652, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.85582733154297, "learning_rate": 9.927935565917761e-07, "loss": 0.5166, "mean_token_accuracy": 0.8408567905426025, "num_tokens": 89483637.0, "step": 2343 }, { "epoch": 0.2981808930161557, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.52346420288086e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.203536987304688, "learning_rate": 9.93217465027554e-07, "loss": 0.5089, "mean_token_accuracy": 0.8486748337745667, "num_tokens": 89526532.0, "step": 2344 }, { "epoch": 0.2983081032947462, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.78141212463379, "learning_rate": 9.936413734633318e-07, "loss": 0.5727, "mean_token_accuracy": 0.8310237526893616, "num_tokens": 89558353.0, "step": 2345 }, { "epoch": 0.29843531357333675, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.944107055664062, "learning_rate": 9.940652818991097e-07, "loss": 0.4641, "mean_token_accuracy": 0.8596096038818359, "num_tokens": 89598727.0, "step": 2346 }, { "epoch": 0.2985625238519272, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.772533416748047, "learning_rate": 9.944891903348877e-07, "loss": 0.5326, "mean_token_accuracy": 0.8383680582046509, "num_tokens": 89640093.0, "step": 2347 }, { "epoch": 0.29868973413051775, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.736995697021484, "learning_rate": 9.949130987706656e-07, "loss": 0.4766, "mean_token_accuracy": 0.854863166809082, "num_tokens": 89681401.0, "step": 2348 }, { "epoch": 0.2988169444091083, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.865781784057617, "learning_rate": 9.953370072064432e-07, "loss": 0.4697, "mean_token_accuracy": 0.8562837839126587, "num_tokens": 89720366.0, "step": 2349 }, { "epoch": 0.29894415468769875, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.9375057220459, "learning_rate": 9.957609156422213e-07, "loss": 0.5469, "mean_token_accuracy": 0.8337201476097107, "num_tokens": 89762234.0, "step": 2350 }, { "epoch": 0.2990713649662893, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.979162216186523, "learning_rate": 9.961848240779991e-07, "loss": 0.4944, "mean_token_accuracy": 0.8511952757835388, "num_tokens": 89792208.0, "step": 2351 }, { "epoch": 0.2991985752448798, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.188556671142578, "learning_rate": 9.96608732513777e-07, "loss": 0.469, "mean_token_accuracy": 0.861696720123291, "num_tokens": 89833246.0, "step": 2352 }, { "epoch": 0.2993257855234703, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.031517028808594, "learning_rate": 9.970326409495548e-07, "loss": 0.53, "mean_token_accuracy": 0.8372599482536316, "num_tokens": 89873271.0, "step": 2353 }, { "epoch": 0.2994529958020608, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.382488250732422, "learning_rate": 9.974565493853327e-07, "loss": 0.5455, "mean_token_accuracy": 0.8332768082618713, "num_tokens": 89913069.0, "step": 2354 }, { "epoch": 0.29958020608065133, "ewc_loss": 0.039794921875, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.008602142333984, "learning_rate": 9.978804578211107e-07, "loss": 0.5235, "mean_token_accuracy": 0.8408539891242981, "num_tokens": 89948915.0, "step": 2355 }, { "epoch": 0.2997074163592418, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.248838424682617, "learning_rate": 9.983043662568886e-07, "loss": 0.5227, "mean_token_accuracy": 0.8426216840744019, "num_tokens": 89982123.0, "step": 2356 }, { "epoch": 0.29983462663783234, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.252288818359375, "learning_rate": 9.987282746926662e-07, "loss": 0.4709, "mean_token_accuracy": 0.8574405908584595, "num_tokens": 90018259.0, "step": 2357 }, { "epoch": 0.29996183691642286, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.16818618774414, "learning_rate": 9.991521831284443e-07, "loss": 0.4956, "mean_token_accuracy": 0.8507956266403198, "num_tokens": 90059041.0, "step": 2358 }, { "epoch": 0.30008904719501334, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.08859634399414, "learning_rate": 9.995760915642221e-07, "loss": 0.4428, "mean_token_accuracy": 0.8641015291213989, "num_tokens": 90089463.0, "step": 2359 }, { "epoch": 0.30021625747360386, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.439035415649414, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8583509922027588, "num_tokens": 90125691.0, "step": 2360 }, { "epoch": 0.3003434677521944, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.858789443969727, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8462332487106323, "num_tokens": 90161778.0, "step": 2361 }, { "epoch": 0.30047067803078487, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.549560546875, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8339121341705322, "num_tokens": 90202243.0, "step": 2362 }, { "epoch": 0.3005978883093754, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.82636070251465, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8414322137832642, "num_tokens": 90248924.0, "step": 2363 }, { "epoch": 0.3007250985879659, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.565147399902344, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8194727897644043, "num_tokens": 90281988.0, "step": 2364 }, { "epoch": 0.3008523088665564, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.178253173828125, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8407225608825684, "num_tokens": 90317050.0, "step": 2365 }, { "epoch": 0.3009795191451469, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.20426368713379, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8566710948944092, "num_tokens": 90358064.0, "step": 2366 }, { "epoch": 0.30110672942373745, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.190841674804688, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.837525486946106, "num_tokens": 90400692.0, "step": 2367 }, { "epoch": 0.3012339397023279, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.010622024536133, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8402944803237915, "num_tokens": 90432207.0, "step": 2368 }, { "epoch": 0.30136114998091845, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.34665870666504, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8453048467636108, "num_tokens": 90466081.0, "step": 2369 }, { "epoch": 0.301488360259509, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.38307762145996, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.854422390460968, "num_tokens": 90501963.0, "step": 2370 }, { "epoch": 0.30161557053809945, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.999208450317383, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8591831922531128, "num_tokens": 90540651.0, "step": 2371 }, { "epoch": 0.30174278081669, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.14690589904785, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8399043083190918, "num_tokens": 90585105.0, "step": 2372 }, { "epoch": 0.3018699910952805, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.292484283447266, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8449124097824097, "num_tokens": 90626773.0, "step": 2373 }, { "epoch": 0.301997201373871, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.163612365722656, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8624985218048096, "num_tokens": 90665974.0, "step": 2374 }, { "epoch": 0.3021244116524615, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.100494384765625, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8442785143852234, "num_tokens": 90706801.0, "step": 2375 }, { "epoch": 0.30225162193105204, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.218017578125, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8442431688308716, "num_tokens": 90743635.0, "step": 2376 }, { "epoch": 0.3023788322096425, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.060338973999023, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8480802178382874, "num_tokens": 90782797.0, "step": 2377 }, { "epoch": 0.30250604248823304, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.120044708251953, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8402807712554932, "num_tokens": 90822661.0, "step": 2378 }, { "epoch": 0.30263325276682357, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.010774612426758, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8303776979446411, "num_tokens": 90857804.0, "step": 2379 }, { "epoch": 0.3027604630454141, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.1411190032959, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8449239134788513, "num_tokens": 90895778.0, "step": 2380 }, { "epoch": 0.30288767332400457, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.055273056030273, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8540754318237305, "num_tokens": 90934463.0, "step": 2381 }, { "epoch": 0.3030148836025951, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.082618713378906, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8519461154937744, "num_tokens": 90967888.0, "step": 2382 }, { "epoch": 0.3031420938811856, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.07940101623535, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.855048418045044, "num_tokens": 91005200.0, "step": 2383 }, { "epoch": 0.3032693041597761, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.263164520263672, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8315680027008057, "num_tokens": 91037840.0, "step": 2384 }, { "epoch": 0.3033965144383666, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.176015853881836, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8331310153007507, "num_tokens": 91073336.0, "step": 2385 }, { "epoch": 0.30352372471695716, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.178478240966797, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.849114179611206, "num_tokens": 91107001.0, "step": 2386 }, { "epoch": 0.30365093499554763, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.273473739624023, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8544274568557739, "num_tokens": 91143937.0, "step": 2387 }, { "epoch": 0.30377814527413816, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.28823471069336, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.83788001537323, "num_tokens": 91185391.0, "step": 2388 }, { "epoch": 0.3039053555527287, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.078031539916992, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8467708826065063, "num_tokens": 91227653.0, "step": 2389 }, { "epoch": 0.30403256583131916, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.470407485961914, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8669110536575317, "num_tokens": 91263610.0, "step": 2390 }, { "epoch": 0.3041597761099097, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.35702896118164, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8424023985862732, "num_tokens": 91296880.0, "step": 2391 }, { "epoch": 0.3042869863885002, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.15654754638672, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.83542400598526, "num_tokens": 91336242.0, "step": 2392 }, { "epoch": 0.3044141966670907, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 24.357524871826172, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8553017973899841, "num_tokens": 91376527.0, "step": 2393 }, { "epoch": 0.3045414069456812, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.909563064575195, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8378883600234985, "num_tokens": 91408415.0, "step": 2394 }, { "epoch": 0.30466861722427174, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.66718864440918, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8485141396522522, "num_tokens": 91449189.0, "step": 2395 }, { "epoch": 0.3047958275028622, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 24.534374237060547, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8436713218688965, "num_tokens": 91488599.0, "step": 2396 }, { "epoch": 0.30492303778145274, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.281341552734375, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8527884483337402, "num_tokens": 91525138.0, "step": 2397 }, { "epoch": 0.30505024806004327, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.309751510620117, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8486368060112, "num_tokens": 91567444.0, "step": 2398 }, { "epoch": 0.30517745833863374, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.63280487060547, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.845689058303833, "num_tokens": 91602926.0, "step": 2399 }, { "epoch": 0.3053046686172243, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 22.965028762817383, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8596674799919128, "num_tokens": 91635901.0, "step": 2400 }, { "epoch": 0.3054318788958148, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.461658477783203, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.85027676820755, "num_tokens": 91679542.0, "step": 2401 }, { "epoch": 0.3055590891744053, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.282127380371094, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8430227637290955, "num_tokens": 91723022.0, "step": 2402 }, { "epoch": 0.3056862994529958, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.451309204101562, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8438037633895874, "num_tokens": 91760664.0, "step": 2403 }, { "epoch": 0.30581350973158633, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.067480087280273, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8442468643188477, "num_tokens": 91805570.0, "step": 2404 }, { "epoch": 0.3059407200101768, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.47634506225586, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8391588926315308, "num_tokens": 91843342.0, "step": 2405 }, { "epoch": 0.30606793028876733, "ewc_loss": 0.040283203125, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.252967834472656, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.838525116443634, "num_tokens": 91883872.0, "step": 2406 }, { "epoch": 0.30619514056735786, "ewc_loss": 0.040771484375, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.966388702392578, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8468536138534546, "num_tokens": 91923174.0, "step": 2407 }, { "epoch": 0.30632235084594833, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.471712112426758, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8473837375640869, "num_tokens": 91963462.0, "step": 2408 }, { "epoch": 0.30644956112453886, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.47303581237793, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8508896231651306, "num_tokens": 92000907.0, "step": 2409 }, { "epoch": 0.3065767714031294, "ewc_loss": 0.0400390625, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 23.489057540893555, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8504296541213989, "num_tokens": 92036110.0, "step": 2410 }, { "epoch": 0.30670398168171986, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.171897888183594, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8434863686561584, "num_tokens": 92080647.0, "step": 2411 }, { "epoch": 0.3068311919603104, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.82008934020996, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8516863584518433, "num_tokens": 92115407.0, "step": 2412 }, { "epoch": 0.3069584022389009, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.221532821655273, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8473536968231201, "num_tokens": 92149660.0, "step": 2413 }, { "epoch": 0.3070856125174914, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.417068481445312, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8507363200187683, "num_tokens": 92191581.0, "step": 2414 }, { "epoch": 0.3072128227960819, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.592100143432617, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8430825471878052, "num_tokens": 92230435.0, "step": 2415 }, { "epoch": 0.30734003307467245, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.413288116455078, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8363869190216064, "num_tokens": 92269545.0, "step": 2416 }, { "epoch": 0.3074672433532629, "ewc_loss": 0.04052734375, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.55979347229004, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8546566963195801, "num_tokens": 92307447.0, "step": 2417 }, { "epoch": 0.30759445363185345, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.391942977905273, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8462055325508118, "num_tokens": 92349441.0, "step": 2418 }, { "epoch": 0.307721663910444, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 24.209125518798828, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.851692259311676, "num_tokens": 92380405.0, "step": 2419 }, { "epoch": 0.30784887418903445, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.20842933654785, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8461223244667053, "num_tokens": 92419596.0, "step": 2420 }, { "epoch": 0.307976084467625, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 24.936683654785156, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8397919535636902, "num_tokens": 92454590.0, "step": 2421 }, { "epoch": 0.3081032947462155, "ewc_loss": 0.040283203125, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.123283386230469e-05, "grad_norm": 22.915441513061523, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8431795835494995, "num_tokens": 92491640.0, "step": 2422 }, { "epoch": 0.308230505024806, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 24.385639190673828, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8396594524383545, "num_tokens": 92528682.0, "step": 2423 }, { "epoch": 0.3083577153033965, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.360637664794922, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8538451194763184, "num_tokens": 92566368.0, "step": 2424 }, { "epoch": 0.30848492558198704, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.457229614257812, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8259803056716919, "num_tokens": 92601380.0, "step": 2425 }, { "epoch": 0.3086121358605775, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.460201263427734, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.864622950553894, "num_tokens": 92639724.0, "step": 2426 }, { "epoch": 0.30873934613916804, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 24.216337203979492, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8534711599349976, "num_tokens": 92677450.0, "step": 2427 }, { "epoch": 0.30886655641775856, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.54229736328125, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8552964329719543, "num_tokens": 92714681.0, "step": 2428 }, { "epoch": 0.3089937666963491, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.785675048828125, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8448331356048584, "num_tokens": 92749925.0, "step": 2429 }, { "epoch": 0.30912097697493957, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.727243423461914, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.851139485836029, "num_tokens": 92791720.0, "step": 2430 }, { "epoch": 0.3092481872535301, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.531719207763672, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8399888277053833, "num_tokens": 92826179.0, "step": 2431 }, { "epoch": 0.3093753975321206, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.71710968017578, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8476995229721069, "num_tokens": 92872751.0, "step": 2432 }, { "epoch": 0.3095026078107111, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.4788875579834, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8412600755691528, "num_tokens": 92907470.0, "step": 2433 }, { "epoch": 0.3096298180893016, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.37963104248047, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.852523922920227, "num_tokens": 92943321.0, "step": 2434 }, { "epoch": 0.30975702836789215, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.908061981201172, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8487223386764526, "num_tokens": 92982136.0, "step": 2435 }, { "epoch": 0.3098842386464826, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.272903442382812, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8362798690795898, "num_tokens": 93015974.0, "step": 2436 }, { "epoch": 0.31001144892507315, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.494844436645508, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8513156175613403, "num_tokens": 93054010.0, "step": 2437 }, { "epoch": 0.3101386592036637, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.383501052856445, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8552608489990234, "num_tokens": 93090226.0, "step": 2438 }, { "epoch": 0.31026586948225415, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.518014907836914, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8444836735725403, "num_tokens": 93130242.0, "step": 2439 }, { "epoch": 0.3103930797608447, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.29277801513672, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8432109355926514, "num_tokens": 93170990.0, "step": 2440 }, { "epoch": 0.3105202900394352, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.392047882080078, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8399255275726318, "num_tokens": 93205478.0, "step": 2441 }, { "epoch": 0.3106475003180257, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.23818588256836, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.840387225151062, "num_tokens": 93240952.0, "step": 2442 }, { "epoch": 0.3107747105966162, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.434173583984375, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.836579442024231, "num_tokens": 93278882.0, "step": 2443 }, { "epoch": 0.31090192087520674, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.590003967285156, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8398064374923706, "num_tokens": 93324573.0, "step": 2444 }, { "epoch": 0.3110291311537972, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.52130126953125, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8604598045349121, "num_tokens": 93361049.0, "step": 2445 }, { "epoch": 0.31115634143238774, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.494108200073242, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8438601493835449, "num_tokens": 93401748.0, "step": 2446 }, { "epoch": 0.31128355171097827, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.681304931640625, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.859341025352478, "num_tokens": 93434857.0, "step": 2447 }, { "epoch": 0.31141076198956874, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.503175735473633, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8441992998123169, "num_tokens": 93474681.0, "step": 2448 }, { "epoch": 0.31153797226815927, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.766357421875, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8450966477394104, "num_tokens": 93510627.0, "step": 2449 }, { "epoch": 0.3116651825467498, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.574752807617188, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8389772176742554, "num_tokens": 93551598.0, "step": 2450 }, { "epoch": 0.31179239282534027, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.862520217895508, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8399778604507446, "num_tokens": 93586622.0, "step": 2451 }, { "epoch": 0.3119196031039308, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.758554458618164, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8356537818908691, "num_tokens": 93626496.0, "step": 2452 }, { "epoch": 0.3120468133825213, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.65372085571289, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8481671810150146, "num_tokens": 93665641.0, "step": 2453 }, { "epoch": 0.3121740236611118, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.892475128173828, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.851839542388916, "num_tokens": 93702135.0, "step": 2454 }, { "epoch": 0.3123012339397023, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.71587562561035, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8532649278640747, "num_tokens": 93738226.0, "step": 2455 }, { "epoch": 0.31242844421829286, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.698747634887695, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8505041599273682, "num_tokens": 93780273.0, "step": 2456 }, { "epoch": 0.31255565449688333, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.709848403930664, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8347145915031433, "num_tokens": 93821977.0, "step": 2457 }, { "epoch": 0.31268286477547386, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 24.08731460571289, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8558495044708252, "num_tokens": 93856084.0, "step": 2458 }, { "epoch": 0.3128100750540644, "ewc_loss": 0.04052734375, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.147125244140625e-05, "grad_norm": 23.387893676757812, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8499107360839844, "num_tokens": 93893491.0, "step": 2459 }, { "epoch": 0.31293728533265486, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.73685073852539, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8353928923606873, "num_tokens": 93940240.0, "step": 2460 }, { "epoch": 0.3130644956112454, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.5073184967041, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8386520743370056, "num_tokens": 93974504.0, "step": 2461 }, { "epoch": 0.3131917058898359, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.358388900756836, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8496555089950562, "num_tokens": 94009208.0, "step": 2462 }, { "epoch": 0.3133189161684264, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.81137466430664, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.847955584526062, "num_tokens": 94049249.0, "step": 2463 }, { "epoch": 0.3134461264470169, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.68961524963379, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8470642566680908, "num_tokens": 94085553.0, "step": 2464 }, { "epoch": 0.31357333672560744, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.771163940429688, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8271852731704712, "num_tokens": 94127531.0, "step": 2465 }, { "epoch": 0.3137005470041979, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.620729446411133, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8584762811660767, "num_tokens": 94171191.0, "step": 2466 }, { "epoch": 0.31382775728278844, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.78692626953125, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8509078025817871, "num_tokens": 94212090.0, "step": 2467 }, { "epoch": 0.313954967561379, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.438655853271484, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8655438423156738, "num_tokens": 94246835.0, "step": 2468 }, { "epoch": 0.31408217783996945, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.757991790771484, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8456953763961792, "num_tokens": 94284581.0, "step": 2469 }, { "epoch": 0.31420938811856, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 24.403833389282227, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8573859930038452, "num_tokens": 94323050.0, "step": 2470 }, { "epoch": 0.3143365983971505, "ewc_loss": 0.040771484375, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.170967102050781e-05, "grad_norm": 23.432781219482422, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8420674204826355, "num_tokens": 94364702.0, "step": 2471 }, { "epoch": 0.314463808675741, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 24.656099319458008, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8529617786407471, "num_tokens": 94404735.0, "step": 2472 }, { "epoch": 0.3145910189543315, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.483671188354492, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.832146942615509, "num_tokens": 94444683.0, "step": 2473 }, { "epoch": 0.31471822923292203, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 24.390953063964844, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8503180742263794, "num_tokens": 94484392.0, "step": 2474 }, { "epoch": 0.3148454395115125, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 24.854921340942383, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8547909259796143, "num_tokens": 94524860.0, "step": 2475 }, { "epoch": 0.31497264979010303, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.313894271850586, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8444190621376038, "num_tokens": 94565076.0, "step": 2476 }, { "epoch": 0.31509986006869356, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 24.35042953491211, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8248888850212097, "num_tokens": 94606191.0, "step": 2477 }, { "epoch": 0.31522707034728403, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.41579818725586, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8425142168998718, "num_tokens": 94648021.0, "step": 2478 }, { "epoch": 0.31535428062587456, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.92966651916504, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8587310314178467, "num_tokens": 94686305.0, "step": 2479 }, { "epoch": 0.3154814909044651, "ewc_loss": 0.041015625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.865095138549805, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8451732397079468, "num_tokens": 94728606.0, "step": 2480 }, { "epoch": 0.3156087011830556, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.521018981933594, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8500797748565674, "num_tokens": 94770162.0, "step": 2481 }, { "epoch": 0.3157359114616461, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.533660888671875, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8554055094718933, "num_tokens": 94807735.0, "step": 2482 }, { "epoch": 0.3158631217402366, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.710596084594727, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8427466750144958, "num_tokens": 94840366.0, "step": 2483 }, { "epoch": 0.31599033201882715, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 24.106449127197266, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.844426155090332, "num_tokens": 94874293.0, "step": 2484 }, { "epoch": 0.3161175422974176, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.671737670898438, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8465036749839783, "num_tokens": 94909347.0, "step": 2485 }, { "epoch": 0.31624475257600815, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.936416625976562, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8574642539024353, "num_tokens": 94955120.0, "step": 2486 }, { "epoch": 0.3163719628545987, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.205732345581055, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8620235919952393, "num_tokens": 94992319.0, "step": 2487 }, { "epoch": 0.31649917313318915, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.312843322753906, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8565186262130737, "num_tokens": 95023464.0, "step": 2488 }, { "epoch": 0.3166263834117797, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.16640281677246, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8602123856544495, "num_tokens": 95055963.0, "step": 2489 }, { "epoch": 0.3167535936903702, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.98817253112793, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8290396332740784, "num_tokens": 95089628.0, "step": 2490 }, { "epoch": 0.3168808039689607, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 25.349822998046875, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8629987239837646, "num_tokens": 95128557.0, "step": 2491 }, { "epoch": 0.3170080142475512, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 24.11460304260254, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8355781435966492, "num_tokens": 95168324.0, "step": 2492 }, { "epoch": 0.31713522452614173, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 24.0970458984375, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8279709815979004, "num_tokens": 95208139.0, "step": 2493 }, { "epoch": 0.3172624348047322, "ewc_loss": 0.041259765625, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.1948089599609375e-05, "grad_norm": 23.430877685546875, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8427520394325256, "num_tokens": 95247337.0, "step": 2494 }, { "epoch": 0.31738964508332274, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.653186798095703, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.842077374458313, "num_tokens": 95289208.0, "step": 2495 }, { "epoch": 0.31751685536191326, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.73828887939453, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8441912531852722, "num_tokens": 95329717.0, "step": 2496 }, { "epoch": 0.31764406564050374, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 24.000890731811523, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8270626068115234, "num_tokens": 95371052.0, "step": 2497 }, { "epoch": 0.31777127591909426, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.21283531188965, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.867514967918396, "num_tokens": 95412486.0, "step": 2498 }, { "epoch": 0.3178984861976848, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.90839385986328, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8645583391189575, "num_tokens": 95446167.0, "step": 2499 }, { "epoch": 0.31802569647627527, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.47629165649414, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8425872921943665, "num_tokens": 95485763.0, "step": 2500 }, { "epoch": 0.3181529067548658, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 24.035097122192383, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8372451066970825, "num_tokens": 95526407.0, "step": 2501 }, { "epoch": 0.3182801170334563, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.524560928344727, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8145604133605957, "num_tokens": 95567479.0, "step": 2502 }, { "epoch": 0.3184073273120468, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 24.02859115600586, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8549272418022156, "num_tokens": 95606588.0, "step": 2503 }, { "epoch": 0.3185345375906373, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.453245162963867, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.85220867395401, "num_tokens": 95653293.0, "step": 2504 }, { "epoch": 0.31866174786922785, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.9929256439209, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8382904529571533, "num_tokens": 95692424.0, "step": 2505 }, { "epoch": 0.3187889581478183, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.85831642150879, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8560234904289246, "num_tokens": 95733679.0, "step": 2506 }, { "epoch": 0.31891616842640885, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.993764877319336, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.842508852481842, "num_tokens": 95768211.0, "step": 2507 }, { "epoch": 0.3190433787049994, "ewc_loss": 0.04150390625, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.218650817871094e-05, "grad_norm": 23.4067440032959, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8452057242393494, "num_tokens": 95805974.0, "step": 2508 }, { "epoch": 0.31917058898358985, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.860767364501953, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8458379507064819, "num_tokens": 95845463.0, "step": 2509 }, { "epoch": 0.3192977992621804, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.797956466674805, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8430190682411194, "num_tokens": 95888148.0, "step": 2510 }, { "epoch": 0.3194250095407709, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.940841674804688, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8691458702087402, "num_tokens": 95925432.0, "step": 2511 }, { "epoch": 0.3195522198193614, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.548133850097656, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8536486625671387, "num_tokens": 95970695.0, "step": 2512 }, { "epoch": 0.3196794300979519, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 24.37670135498047, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8486058115959167, "num_tokens": 96006093.0, "step": 2513 }, { "epoch": 0.31980664037654244, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.35849952697754, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8476426601409912, "num_tokens": 96045945.0, "step": 2514 }, { "epoch": 0.3199338506551329, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.309473037719727, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8628631830215454, "num_tokens": 96076748.0, "step": 2515 }, { "epoch": 0.32006106093372344, "ewc_loss": 0.041748046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.24249267578125e-05, "grad_norm": 23.531169891357422, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8491666316986084, "num_tokens": 96116964.0, "step": 2516 }, { "epoch": 0.32018827121231397, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.711669921875, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8504936695098877, "num_tokens": 96152073.0, "step": 2517 }, { "epoch": 0.32031548149090444, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.741186141967773, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.846508800983429, "num_tokens": 96191608.0, "step": 2518 }, { "epoch": 0.32044269176949497, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.87885284423828, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8657658696174622, "num_tokens": 96228124.0, "step": 2519 }, { "epoch": 0.3205699020480855, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.68402099609375, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8592286109924316, "num_tokens": 96265870.0, "step": 2520 }, { "epoch": 0.32069711232667597, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.970399856567383, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8584814667701721, "num_tokens": 96309623.0, "step": 2521 }, { "epoch": 0.3208243226052665, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.5721492767334, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8409113883972168, "num_tokens": 96344036.0, "step": 2522 }, { "epoch": 0.320951532883857, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.461122512817383, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8597980737686157, "num_tokens": 96377627.0, "step": 2523 }, { "epoch": 0.3210787431624475, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.809526443481445, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8492749333381653, "num_tokens": 96413568.0, "step": 2524 }, { "epoch": 0.32120595344103803, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.462665557861328, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8450859785079956, "num_tokens": 96454682.0, "step": 2525 }, { "epoch": 0.32133316371962856, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.885604858398438, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8374694585800171, "num_tokens": 96490456.0, "step": 2526 }, { "epoch": 0.32146037399821903, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.570039749145508, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8414709568023682, "num_tokens": 96526891.0, "step": 2527 }, { "epoch": 0.32158758427680956, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.932472229003906, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8427051305770874, "num_tokens": 96575226.0, "step": 2528 }, { "epoch": 0.3217147945554001, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.83173370361328, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8383785486221313, "num_tokens": 96606383.0, "step": 2529 }, { "epoch": 0.3218420048339906, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.53780174255371, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8509156703948975, "num_tokens": 96639826.0, "step": 2530 }, { "epoch": 0.3219692151125811, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.649816513061523, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8518943786621094, "num_tokens": 96679151.0, "step": 2531 }, { "epoch": 0.3220964253911716, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.430194854736328, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8550519347190857, "num_tokens": 96719365.0, "step": 2532 }, { "epoch": 0.32222363566976214, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.60628890991211, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8454967737197876, "num_tokens": 96758318.0, "step": 2533 }, { "epoch": 0.3223508459483526, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.6357364654541, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8425024151802063, "num_tokens": 96791159.0, "step": 2534 }, { "epoch": 0.32247805622694314, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.68600845336914, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8512234687805176, "num_tokens": 96825352.0, "step": 2535 }, { "epoch": 0.32260526650553367, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.094327926635742, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8460543155670166, "num_tokens": 96859700.0, "step": 2536 }, { "epoch": 0.32273247678412414, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.497295379638672, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8458384275436401, "num_tokens": 96897077.0, "step": 2537 }, { "epoch": 0.3228596870627147, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.65753936767578, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8419939875602722, "num_tokens": 96936936.0, "step": 2538 }, { "epoch": 0.3229868973413052, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.720426559448242, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8534314036369324, "num_tokens": 96975702.0, "step": 2539 }, { "epoch": 0.3231141076198957, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.60214614868164, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8462748527526855, "num_tokens": 97010585.0, "step": 2540 }, { "epoch": 0.3232413178984862, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.568363189697266, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8718148469924927, "num_tokens": 97047210.0, "step": 2541 }, { "epoch": 0.32336852817707673, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.815895080566406, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8426744341850281, "num_tokens": 97084835.0, "step": 2542 }, { "epoch": 0.3234957384556672, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.77571678161621, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8454864025115967, "num_tokens": 97125829.0, "step": 2543 }, { "epoch": 0.32362294873425773, "ewc_loss": 0.042236328125, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.792264938354492, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8423367738723755, "num_tokens": 97162874.0, "step": 2544 }, { "epoch": 0.32375015901284826, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.69923210144043, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8509902954101562, "num_tokens": 97197559.0, "step": 2545 }, { "epoch": 0.32387736929143873, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.954561233520508, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8355033993721008, "num_tokens": 97227028.0, "step": 2546 }, { "epoch": 0.32400457957002926, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.376785278320312, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8439376354217529, "num_tokens": 97270209.0, "step": 2547 }, { "epoch": 0.3241317898486198, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.750743865966797, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8513127565383911, "num_tokens": 97309176.0, "step": 2548 }, { "epoch": 0.32425900012721026, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.68536376953125, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8274178504943848, "num_tokens": 97340464.0, "step": 2549 }, { "epoch": 0.3243862104058008, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.769519805908203, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8505755662918091, "num_tokens": 97378435.0, "step": 2550 }, { "epoch": 0.3245134206843913, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.39374542236328, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8443624973297119, "num_tokens": 97417886.0, "step": 2551 }, { "epoch": 0.3246406309629818, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.712072372436523, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8509842157363892, "num_tokens": 97450552.0, "step": 2552 }, { "epoch": 0.3247678412415723, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.89381980895996, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8553066253662109, "num_tokens": 97488586.0, "step": 2553 }, { "epoch": 0.32489505152016285, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.58319664001465, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8427755236625671, "num_tokens": 97528893.0, "step": 2554 }, { "epoch": 0.3250222617987533, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.647680282592773, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8252735733985901, "num_tokens": 97568654.0, "step": 2555 }, { "epoch": 0.32514947207734385, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.9558162689209, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8518849611282349, "num_tokens": 97607068.0, "step": 2556 }, { "epoch": 0.3252766823559344, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.763559341430664, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8699966669082642, "num_tokens": 97647505.0, "step": 2557 }, { "epoch": 0.32540389263452485, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.92724609375, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8417181372642517, "num_tokens": 97690505.0, "step": 2558 }, { "epoch": 0.3255311029131154, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.98565673828125, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8365548253059387, "num_tokens": 97723955.0, "step": 2559 }, { "epoch": 0.3256583131917059, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.689990997314453, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8425208926200867, "num_tokens": 97763822.0, "step": 2560 }, { "epoch": 0.3257855234702964, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.889297485351562, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8431035280227661, "num_tokens": 97803827.0, "step": 2561 }, { "epoch": 0.3259127337488869, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.026241302490234, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8476529121398926, "num_tokens": 97841056.0, "step": 2562 }, { "epoch": 0.32603994402747744, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.67931365966797, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8658794164657593, "num_tokens": 97881714.0, "step": 2563 }, { "epoch": 0.3261671543060679, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.02527618408203, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8446478843688965, "num_tokens": 97925797.0, "step": 2564 }, { "epoch": 0.32629436458465844, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.652170181274414, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.852184534072876, "num_tokens": 97963638.0, "step": 2565 }, { "epoch": 0.32642157486324896, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.231765747070312, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8413814306259155, "num_tokens": 98002043.0, "step": 2566 }, { "epoch": 0.32654878514183944, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.442373275756836, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8497680425643921, "num_tokens": 98041031.0, "step": 2567 }, { "epoch": 0.32667599542042997, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.90252685546875, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8556891679763794, "num_tokens": 98075510.0, "step": 2568 }, { "epoch": 0.3268032056990205, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.2901763916015625e-05, "grad_norm": 23.696170806884766, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8529888987541199, "num_tokens": 98114162.0, "step": 2569 }, { "epoch": 0.32693041597761097, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.971933364868164, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8426267504692078, "num_tokens": 98148722.0, "step": 2570 }, { "epoch": 0.3270576262562015, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 25.19748878479004, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.826522171497345, "num_tokens": 98188255.0, "step": 2571 }, { "epoch": 0.327184836534792, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 23.391332626342773, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.858378529548645, "num_tokens": 98228597.0, "step": 2572 }, { "epoch": 0.3273120468133825, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.86338996887207, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8393914699554443, "num_tokens": 98273405.0, "step": 2573 }, { "epoch": 0.327439257091973, "ewc_loss": 0.0419921875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.266334533691406e-05, "grad_norm": 24.08308219909668, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8330037593841553, "num_tokens": 98315091.0, "step": 2574 }, { "epoch": 0.32756646737056355, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.55063819885254, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8488280177116394, "num_tokens": 98356967.0, "step": 2575 }, { "epoch": 0.327693677649154, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.7537899017334, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.838068962097168, "num_tokens": 98394455.0, "step": 2576 }, { "epoch": 0.32782088792774455, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.754425048828125, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8564000129699707, "num_tokens": 98430455.0, "step": 2577 }, { "epoch": 0.3279480982063351, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.121143341064453, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8341761827468872, "num_tokens": 98472218.0, "step": 2578 }, { "epoch": 0.3280753084849256, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.871028900146484, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8547933101654053, "num_tokens": 98509540.0, "step": 2579 }, { "epoch": 0.3282025187635161, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.695512771606445, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8471289277076721, "num_tokens": 98544761.0, "step": 2580 }, { "epoch": 0.3283297290421066, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.818002700805664, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8605491518974304, "num_tokens": 98580522.0, "step": 2581 }, { "epoch": 0.32845693932069714, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.670007705688477, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8553417921066284, "num_tokens": 98612988.0, "step": 2582 }, { "epoch": 0.3285841495992876, "ewc_loss": 0.04248046875, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.82509422302246, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8365091681480408, "num_tokens": 98654213.0, "step": 2583 }, { "epoch": 0.32871135987787814, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.1447696685791, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8505781888961792, "num_tokens": 98697168.0, "step": 2584 }, { "epoch": 0.32883857015646867, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.599496841430664, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8533981442451477, "num_tokens": 98735664.0, "step": 2585 }, { "epoch": 0.32896578043505914, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.348051071166992, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8425216674804688, "num_tokens": 98777257.0, "step": 2586 }, { "epoch": 0.32909299071364967, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.032451629638672, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8511854410171509, "num_tokens": 98812443.0, "step": 2587 }, { "epoch": 0.3292202009922402, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.742572784423828, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8564035296440125, "num_tokens": 98848231.0, "step": 2588 }, { "epoch": 0.32934741127083067, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 24.1097469329834, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8522720336914062, "num_tokens": 98886487.0, "step": 2589 }, { "epoch": 0.3294746215494212, "ewc_loss": 0.042724609375, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.314018249511719e-05, "grad_norm": 23.2862491607666, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8377093076705933, "num_tokens": 98924750.0, "step": 2590 }, { "epoch": 0.3296018318280117, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.13414192199707, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8260526657104492, "num_tokens": 98964151.0, "step": 2591 }, { "epoch": 0.3297290421066022, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.259632110595703, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8579574823379517, "num_tokens": 99000277.0, "step": 2592 }, { "epoch": 0.3298562523851927, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.151344299316406, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8389075994491577, "num_tokens": 99042335.0, "step": 2593 }, { "epoch": 0.32998346266378326, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.444087982177734, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8457343578338623, "num_tokens": 99082964.0, "step": 2594 }, { "epoch": 0.33011067294237373, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.871482849121094, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8707675933837891, "num_tokens": 99123440.0, "step": 2595 }, { "epoch": 0.33023788322096426, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.82320213317871, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8468037843704224, "num_tokens": 99167047.0, "step": 2596 }, { "epoch": 0.3303650934995548, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 23.912797927856445, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.844852089881897, "num_tokens": 99202594.0, "step": 2597 }, { "epoch": 0.33049230377814526, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.806793212890625, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8539897203445435, "num_tokens": 99244732.0, "step": 2598 }, { "epoch": 0.3306195140567358, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 23.829694747924805, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8587313890457153, "num_tokens": 99287161.0, "step": 2599 }, { "epoch": 0.3307467243353263, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.020952224731445, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8412132263183594, "num_tokens": 99322650.0, "step": 2600 }, { "epoch": 0.3308739346139168, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.94587516784668, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.848685622215271, "num_tokens": 99358529.0, "step": 2601 }, { "epoch": 0.3310011448925073, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.796382904052734, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8575528264045715, "num_tokens": 99394010.0, "step": 2602 }, { "epoch": 0.33112835517109784, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.80263328552246, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8506105542182922, "num_tokens": 99434764.0, "step": 2603 }, { "epoch": 0.3312555654496883, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.042491912841797, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8501735329627991, "num_tokens": 99481322.0, "step": 2604 }, { "epoch": 0.33138277572827884, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.877208709716797, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8396053314208984, "num_tokens": 99517719.0, "step": 2605 }, { "epoch": 0.3315099860068694, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.195615768432617, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8511829376220703, "num_tokens": 99553662.0, "step": 2606 }, { "epoch": 0.33163719628545985, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.723867416381836, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8474748134613037, "num_tokens": 99589195.0, "step": 2607 }, { "epoch": 0.3317644065640504, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.281089782714844, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8506122827529907, "num_tokens": 99626092.0, "step": 2608 }, { "epoch": 0.3318916168426409, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.10015106201172, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8230158686637878, "num_tokens": 99667087.0, "step": 2609 }, { "epoch": 0.3320188271212314, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.18987274169922, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8447602987289429, "num_tokens": 99705929.0, "step": 2610 }, { "epoch": 0.3321460373998219, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.903263092041016, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8439100980758667, "num_tokens": 99742044.0, "step": 2611 }, { "epoch": 0.33227324767841243, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.38323211669922, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.851553201675415, "num_tokens": 99772621.0, "step": 2612 }, { "epoch": 0.3324004579570029, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.069978713989258, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8535791039466858, "num_tokens": 99814405.0, "step": 2613 }, { "epoch": 0.33252766823559343, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.17821502685547, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.85878586769104, "num_tokens": 99852197.0, "step": 2614 }, { "epoch": 0.33265487851418396, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.17925453186035, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8443709015846252, "num_tokens": 99894254.0, "step": 2615 }, { "epoch": 0.33278208879277443, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.431316375732422, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8417749404907227, "num_tokens": 99936230.0, "step": 2616 }, { "epoch": 0.33290929907136496, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.03544807434082, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8495373129844666, "num_tokens": 99974964.0, "step": 2617 }, { "epoch": 0.3330365093499555, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.1872615814209, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8391581773757935, "num_tokens": 100018750.0, "step": 2618 }, { "epoch": 0.33316371962854596, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.197782516479492, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8434078097343445, "num_tokens": 100054064.0, "step": 2619 }, { "epoch": 0.3332909299071365, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.951045989990234, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8533164262771606, "num_tokens": 100092549.0, "step": 2620 }, { "epoch": 0.333418140185727, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.304637908935547, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8221865892410278, "num_tokens": 100123973.0, "step": 2621 }, { "epoch": 0.3335453504643175, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.915739059448242, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8460111021995544, "num_tokens": 100162142.0, "step": 2622 }, { "epoch": 0.333672560742908, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.390907287597656, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8428083658218384, "num_tokens": 100199352.0, "step": 2623 }, { "epoch": 0.33379977102149855, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.389707565307617, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8329193592071533, "num_tokens": 100237446.0, "step": 2624 }, { "epoch": 0.333926981300089, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.523393630981445, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8540740013122559, "num_tokens": 100272986.0, "step": 2625 }, { "epoch": 0.33405419157867955, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.621793746948242, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8402218222618103, "num_tokens": 100304998.0, "step": 2626 }, { "epoch": 0.3341814018572701, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.825719833374023, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8451399803161621, "num_tokens": 100338473.0, "step": 2627 }, { "epoch": 0.33430861213586055, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.082796096801758, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.831536054611206, "num_tokens": 100372644.0, "step": 2628 }, { "epoch": 0.3344358224144511, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.376367568969727, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8278683423995972, "num_tokens": 100413248.0, "step": 2629 }, { "epoch": 0.3345630326930416, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.2280330657959, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.840883731842041, "num_tokens": 100448022.0, "step": 2630 }, { "epoch": 0.33469024297163213, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.238906860351562, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8363049626350403, "num_tokens": 100493222.0, "step": 2631 }, { "epoch": 0.3348174532502226, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 23.969663619995117, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8532687425613403, "num_tokens": 100534736.0, "step": 2632 }, { "epoch": 0.33494466352881314, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.300622940063477, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8559644222259521, "num_tokens": 100576754.0, "step": 2633 }, { "epoch": 0.33507187380740366, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.09611701965332, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8387612104415894, "num_tokens": 100611407.0, "step": 2634 }, { "epoch": 0.33519908408599414, "ewc_loss": 0.04296875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.337860107421875e-05, "grad_norm": 24.061372756958008, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8539513349533081, "num_tokens": 100643531.0, "step": 2635 }, { "epoch": 0.33532629436458466, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.102272033691406, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8482437133789062, "num_tokens": 100678612.0, "step": 2636 }, { "epoch": 0.3354535046431752, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.230762481689453, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8525822162628174, "num_tokens": 100718873.0, "step": 2637 }, { "epoch": 0.33558071492176567, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.354177474975586, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8454355597496033, "num_tokens": 100759883.0, "step": 2638 }, { "epoch": 0.3357079252003562, "ewc_loss": 0.043701171875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 23.969280242919922, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8280168771743774, "num_tokens": 100800022.0, "step": 2639 }, { "epoch": 0.3358351354789467, "ewc_loss": 0.043212890625, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.4516544342041, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8326929211616516, "num_tokens": 100837299.0, "step": 2640 }, { "epoch": 0.3359623457575372, "ewc_loss": 0.043701171875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 23.903486251831055, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8430896401405334, "num_tokens": 100868433.0, "step": 2641 }, { "epoch": 0.3360895560361277, "ewc_loss": 0.043701171875, "ewc_loss_diag": 9.59634780883789e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 24.49711799621582, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8336588144302368, "num_tokens": 100907768.0, "step": 2642 }, { "epoch": 0.33621676631471825, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 23.83721351623535, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.824029803276062, "num_tokens": 100942723.0, "step": 2643 }, { "epoch": 0.3363439765933087, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 24.321842193603516, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.853638768196106, "num_tokens": 100984997.0, "step": 2644 }, { "epoch": 0.33647118687189925, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.0091552734375, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8624221086502075, "num_tokens": 101026290.0, "step": 2645 }, { "epoch": 0.3365983971504898, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 24.36924934387207, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.84150230884552, "num_tokens": 101066766.0, "step": 2646 }, { "epoch": 0.33672560742908025, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.361701965332031e-05, "grad_norm": 23.78074836730957, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8516911864280701, "num_tokens": 101100833.0, "step": 2647 }, { "epoch": 0.3368528177076708, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 24.12306022644043, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8585163354873657, "num_tokens": 101145552.0, "step": 2648 }, { "epoch": 0.3369800279862613, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 24.223861694335938, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8494346141815186, "num_tokens": 101186644.0, "step": 2649 }, { "epoch": 0.3371072382648518, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 23.933713912963867, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8451938629150391, "num_tokens": 101223356.0, "step": 2650 }, { "epoch": 0.3372344485434423, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 24.121843338012695, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8610048294067383, "num_tokens": 101260527.0, "step": 2651 }, { "epoch": 0.33736165882203284, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 24.138742446899414, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8474693298339844, "num_tokens": 101305403.0, "step": 2652 }, { "epoch": 0.3374888691006233, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 24.080699920654297, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8329306244850159, "num_tokens": 101345858.0, "step": 2653 }, { "epoch": 0.33761607937921384, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 23.932958602905273, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8231030702590942, "num_tokens": 101380321.0, "step": 2654 }, { "epoch": 0.33774328965780437, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 24.16813850402832, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8561655879020691, "num_tokens": 101419120.0, "step": 2655 }, { "epoch": 0.33787049993639484, "ewc_loss": 0.04345703125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.3855438232421875e-05, "grad_norm": 24.01963996887207, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8488754034042358, "num_tokens": 101453272.0, "step": 2656 }, { "epoch": 0.33799771021498537, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 23.86141586303711, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8368419408798218, "num_tokens": 101486879.0, "step": 2657 }, { "epoch": 0.3381249204935759, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 23.88121223449707, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8398421406745911, "num_tokens": 101523691.0, "step": 2658 }, { "epoch": 0.33825213077216637, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 23.798521041870117, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8492063283920288, "num_tokens": 101555279.0, "step": 2659 }, { "epoch": 0.3383793410507569, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 24.079509735107422, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8640127182006836, "num_tokens": 101591780.0, "step": 2660 }, { "epoch": 0.3385065513293474, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 23.904850006103516, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8645898699760437, "num_tokens": 101631041.0, "step": 2661 }, { "epoch": 0.3386337616079379, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 24.064178466796875, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8368678092956543, "num_tokens": 101666315.0, "step": 2662 }, { "epoch": 0.33876097188652843, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 23.819074630737305, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8622369766235352, "num_tokens": 101704046.0, "step": 2663 }, { "epoch": 0.33888818216511896, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.48493194580078, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8586590886116028, "num_tokens": 101743076.0, "step": 2664 }, { "epoch": 0.33901539244370943, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.97443199157715, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8609251976013184, "num_tokens": 101784006.0, "step": 2665 }, { "epoch": 0.33914260272229996, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.30902099609375, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8540670871734619, "num_tokens": 101821845.0, "step": 2666 }, { "epoch": 0.3392698130008905, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.409385681152344e-05, "grad_norm": 24.133567810058594, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8539693355560303, "num_tokens": 101861121.0, "step": 2667 }, { "epoch": 0.33939702327948096, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 24.40397834777832, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8439748883247375, "num_tokens": 101894330.0, "step": 2668 }, { "epoch": 0.3395242335580715, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 23.73832130432129, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8332763910293579, "num_tokens": 101934835.0, "step": 2669 }, { "epoch": 0.339651443836662, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 24.375598907470703, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8314065933227539, "num_tokens": 101970943.0, "step": 2670 }, { "epoch": 0.3397786541152525, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 23.90253448486328, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8362418413162231, "num_tokens": 102001211.0, "step": 2671 }, { "epoch": 0.339905864393843, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.570281982421875, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8540894389152527, "num_tokens": 102039399.0, "step": 2672 }, { "epoch": 0.34003307467243354, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.947097778320312, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8371336460113525, "num_tokens": 102070635.0, "step": 2673 }, { "epoch": 0.340160284951024, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 24.260589599609375, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.852441668510437, "num_tokens": 102104590.0, "step": 2674 }, { "epoch": 0.34028749522961454, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 23.82253074645996, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8531608581542969, "num_tokens": 102149267.0, "step": 2675 }, { "epoch": 0.3404147055082051, "ewc_loss": 0.0439453125, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4332275390625e-05, "grad_norm": 24.085779190063477, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8331320285797119, "num_tokens": 102188089.0, "step": 2676 }, { "epoch": 0.34054191578679555, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 23.916139602661133, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8553228974342346, "num_tokens": 102224335.0, "step": 2677 }, { "epoch": 0.3406691260653861, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.918739318847656, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8518396615982056, "num_tokens": 102263421.0, "step": 2678 }, { "epoch": 0.3407963363439766, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.25938606262207, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8312556743621826, "num_tokens": 102301133.0, "step": 2679 }, { "epoch": 0.34092354662256713, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.936363220214844, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8526811599731445, "num_tokens": 102338516.0, "step": 2680 }, { "epoch": 0.3410507569011576, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.55561637878418, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8382433652877808, "num_tokens": 102376010.0, "step": 2681 }, { "epoch": 0.34117796717974813, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.13478660583496, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.870596170425415, "num_tokens": 102417080.0, "step": 2682 }, { "epoch": 0.34130517745833866, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.072397232055664, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8501906394958496, "num_tokens": 102458042.0, "step": 2683 }, { "epoch": 0.34143238773692913, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 24.146997451782227, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8547736406326294, "num_tokens": 102497152.0, "step": 2684 }, { "epoch": 0.34155959801551966, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.926664352416992, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8454862833023071, "num_tokens": 102531486.0, "step": 2685 }, { "epoch": 0.3416868082941102, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.245025634765625, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.846213698387146, "num_tokens": 102571552.0, "step": 2686 }, { "epoch": 0.34181401857270066, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.457069396972656e-05, "grad_norm": 23.860754013061523, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8531647324562073, "num_tokens": 102607338.0, "step": 2687 }, { "epoch": 0.3419412288512912, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.079195022583008, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8468803763389587, "num_tokens": 102654445.0, "step": 2688 }, { "epoch": 0.3420684391298817, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.950817108154297, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8555677533149719, "num_tokens": 102695265.0, "step": 2689 }, { "epoch": 0.3421956494084722, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.940814971923828, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8526166677474976, "num_tokens": 102728621.0, "step": 2690 }, { "epoch": 0.3423228596870627, "ewc_loss": 0.044677734375, "ewc_loss_diag": 9.715557098388672e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.2983341217041, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.842200756072998, "num_tokens": 102766378.0, "step": 2691 }, { "epoch": 0.34245006996565325, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.62400245666504, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8474913835525513, "num_tokens": 102806331.0, "step": 2692 }, { "epoch": 0.3425772802442437, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.09565544128418, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8501231670379639, "num_tokens": 102845314.0, "step": 2693 }, { "epoch": 0.34270449052283425, "ewc_loss": 0.04443359375, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 23.84341049194336, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8469772338867188, "num_tokens": 102888614.0, "step": 2694 }, { "epoch": 0.3428317008014248, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.911354064941406, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8433221578598022, "num_tokens": 102925988.0, "step": 2695 }, { "epoch": 0.34295891108001525, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 23.98149299621582, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.856587290763855, "num_tokens": 102963150.0, "step": 2696 }, { "epoch": 0.3430861213586058, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.081026077270508, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8553847074508667, "num_tokens": 102999164.0, "step": 2697 }, { "epoch": 0.3432133316371963, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.026819229125977, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.857306957244873, "num_tokens": 103034337.0, "step": 2698 }, { "epoch": 0.3433405419157868, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.93315887451172, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8549290299415588, "num_tokens": 103075062.0, "step": 2699 }, { "epoch": 0.3434677521943773, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.10393714904785, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8431054353713989, "num_tokens": 103113209.0, "step": 2700 }, { "epoch": 0.34359496247296784, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 23.998205184936523, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8523991107940674, "num_tokens": 103153117.0, "step": 2701 }, { "epoch": 0.3437221727515583, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.179697036743164, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8473898768424988, "num_tokens": 103192234.0, "step": 2702 }, { "epoch": 0.34384938303014884, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 23.972604751586914, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8454227447509766, "num_tokens": 103224958.0, "step": 2703 }, { "epoch": 0.34397659330873936, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.72920799255371, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.850271463394165, "num_tokens": 103259679.0, "step": 2704 }, { "epoch": 0.34410380358732984, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 23.78293800354004, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8403772711753845, "num_tokens": 103299987.0, "step": 2705 }, { "epoch": 0.34423101386592037, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.17680549621582, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8355145454406738, "num_tokens": 103344457.0, "step": 2706 }, { "epoch": 0.3443582241445109, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.196819305419922, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.853060245513916, "num_tokens": 103385173.0, "step": 2707 }, { "epoch": 0.34448543442310137, "ewc_loss": 0.044677734375, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.087867736816406, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8343135118484497, "num_tokens": 103425972.0, "step": 2708 }, { "epoch": 0.3446126447016919, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.065420150756836, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8451175689697266, "num_tokens": 103466819.0, "step": 2709 }, { "epoch": 0.3447398549802824, "ewc_loss": 0.044677734375, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.1719970703125, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8369019627571106, "num_tokens": 103509660.0, "step": 2710 }, { "epoch": 0.3448670652588729, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.32438850402832, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8430278897285461, "num_tokens": 103544557.0, "step": 2711 }, { "epoch": 0.3449942755374634, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.945568084716797, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8574403524398804, "num_tokens": 103585132.0, "step": 2712 }, { "epoch": 0.34512148581605395, "ewc_loss": 0.044677734375, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.141498565673828, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8572819828987122, "num_tokens": 103626353.0, "step": 2713 }, { "epoch": 0.3452486960946444, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.233991622924805, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8518630266189575, "num_tokens": 103659035.0, "step": 2714 }, { "epoch": 0.34537590637323495, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.14406967163086, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8560916185379028, "num_tokens": 103699497.0, "step": 2715 }, { "epoch": 0.3455031166518255, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.02498435974121, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8377917408943176, "num_tokens": 103733685.0, "step": 2716 }, { "epoch": 0.34563032693041595, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.59184455871582, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8321526050567627, "num_tokens": 103769399.0, "step": 2717 }, { "epoch": 0.3457575372090065, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.140323638916016, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8558183908462524, "num_tokens": 103809078.0, "step": 2718 }, { "epoch": 0.345884747487597, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.316679000854492, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8668442964553833, "num_tokens": 103846764.0, "step": 2719 }, { "epoch": 0.3460119577661875, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.01514434814453, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8591365814208984, "num_tokens": 103883089.0, "step": 2720 }, { "epoch": 0.346139168044778, "ewc_loss": 0.044677734375, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.4809112548828125e-05, "grad_norm": 24.2511043548584, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8656176328659058, "num_tokens": 103915775.0, "step": 2721 }, { "epoch": 0.34626637832336854, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.088294982910156, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8456590175628662, "num_tokens": 103953703.0, "step": 2722 }, { "epoch": 0.346393588601959, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.276430130004883, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8392983675003052, "num_tokens": 103990775.0, "step": 2723 }, { "epoch": 0.34652079888054954, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.922698974609375, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8521696329116821, "num_tokens": 104025927.0, "step": 2724 }, { "epoch": 0.34664800915914007, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.103086471557617, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8400475978851318, "num_tokens": 104065860.0, "step": 2725 }, { "epoch": 0.34677521943773054, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.386816024780273, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8518401384353638, "num_tokens": 104102117.0, "step": 2726 }, { "epoch": 0.34690242971632107, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.763025283813477, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8463336229324341, "num_tokens": 104140896.0, "step": 2727 }, { "epoch": 0.3470296399949116, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.41063690185547, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8477650880813599, "num_tokens": 104181355.0, "step": 2728 }, { "epoch": 0.3471568502735021, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.831315994262695, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8502621054649353, "num_tokens": 104218564.0, "step": 2729 }, { "epoch": 0.3472840605520926, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.316261291503906, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8537328243255615, "num_tokens": 104256553.0, "step": 2730 }, { "epoch": 0.3474112708306831, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 23.803264617919922, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8627681732177734, "num_tokens": 104291150.0, "step": 2731 }, { "epoch": 0.34753848110927366, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.76341438293457, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8416200876235962, "num_tokens": 104327977.0, "step": 2732 }, { "epoch": 0.34766569138786413, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.02157974243164, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8447256088256836, "num_tokens": 104365167.0, "step": 2733 }, { "epoch": 0.34779290166645466, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 25.406421661376953, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8545197248458862, "num_tokens": 104400304.0, "step": 2734 }, { "epoch": 0.3479201119450452, "ewc_loss": 0.044921875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.504753112792969e-05, "grad_norm": 24.121801376342773, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8345493078231812, "num_tokens": 104437974.0, "step": 2735 }, { "epoch": 0.34804732222363566, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.46900749206543, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8523131012916565, "num_tokens": 104474474.0, "step": 2736 }, { "epoch": 0.3481745325022262, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.222871780395508, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8353868722915649, "num_tokens": 104511004.0, "step": 2737 }, { "epoch": 0.3483017427808167, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.373394012451172, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8430014252662659, "num_tokens": 104551889.0, "step": 2738 }, { "epoch": 0.3484289530594072, "ewc_loss": 0.045166015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.049087524414062, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8335461616516113, "num_tokens": 104591962.0, "step": 2739 }, { "epoch": 0.3485561633379977, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.226627349853516, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8474979400634766, "num_tokens": 104636201.0, "step": 2740 }, { "epoch": 0.34868337361658824, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.1684627532959, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8483692407608032, "num_tokens": 104673448.0, "step": 2741 }, { "epoch": 0.3488105838951787, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.058012008666992, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8576151728630066, "num_tokens": 104708879.0, "step": 2742 }, { "epoch": 0.34893779417376924, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.0629825592041, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.859137773513794, "num_tokens": 104747073.0, "step": 2743 }, { "epoch": 0.3490650044523598, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.192920684814453, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8478658199310303, "num_tokens": 104786950.0, "step": 2744 }, { "epoch": 0.34919221473095025, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.345170974731445, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8539528250694275, "num_tokens": 104828306.0, "step": 2745 }, { "epoch": 0.3493194250095408, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.191808700561523, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.839840829372406, "num_tokens": 104867030.0, "step": 2746 }, { "epoch": 0.3494466352881313, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.17216682434082, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8526285886764526, "num_tokens": 104909182.0, "step": 2747 }, { "epoch": 0.3495738455667218, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.622371673583984, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8453487753868103, "num_tokens": 104950648.0, "step": 2748 }, { "epoch": 0.3497010558453123, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 23.93548583984375, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8550643920898438, "num_tokens": 104994780.0, "step": 2749 }, { "epoch": 0.34982826612390283, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.433345794677734, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8570091724395752, "num_tokens": 105029469.0, "step": 2750 }, { "epoch": 0.3499554764024933, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.117162704467773, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8429211378097534, "num_tokens": 105066797.0, "step": 2751 }, { "epoch": 0.35008268668108383, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.382532119750977, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8655611276626587, "num_tokens": 105100869.0, "step": 2752 }, { "epoch": 0.35020989695967436, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.034454345703125, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.861681342124939, "num_tokens": 105135409.0, "step": 2753 }, { "epoch": 0.35033710723826483, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.671430587768555, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8554585576057434, "num_tokens": 105172839.0, "step": 2754 }, { "epoch": 0.35046431751685536, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 23.908275604248047, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8557053804397583, "num_tokens": 105216656.0, "step": 2755 }, { "epoch": 0.3505915277954459, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 26.21090316772461, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8441494107246399, "num_tokens": 105253740.0, "step": 2756 }, { "epoch": 0.35071873807403636, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.528594970703125e-05, "grad_norm": 24.284482955932617, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8533014059066772, "num_tokens": 105290073.0, "step": 2757 }, { "epoch": 0.3508459483526269, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.248180389404297, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.855399489402771, "num_tokens": 105325933.0, "step": 2758 }, { "epoch": 0.3509731586312174, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.376388549804688, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.852651834487915, "num_tokens": 105362962.0, "step": 2759 }, { "epoch": 0.3511003689098079, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.26865005493164, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8505075573921204, "num_tokens": 105410074.0, "step": 2760 }, { "epoch": 0.3512275791883984, "ewc_loss": 0.04541015625, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.23309326171875, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8365929126739502, "num_tokens": 105450264.0, "step": 2761 }, { "epoch": 0.35135478946698895, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.60542106628418, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8407647609710693, "num_tokens": 105486566.0, "step": 2762 }, { "epoch": 0.3514819997455794, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.129789352416992, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8537378311157227, "num_tokens": 105527185.0, "step": 2763 }, { "epoch": 0.35160921002416995, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.488872528076172, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8472716808319092, "num_tokens": 105557401.0, "step": 2764 }, { "epoch": 0.3517364203027605, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.47120475769043, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8544051647186279, "num_tokens": 105586877.0, "step": 2765 }, { "epoch": 0.35186363058135095, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 23.946063995361328, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8532949686050415, "num_tokens": 105626988.0, "step": 2766 }, { "epoch": 0.3519908408599415, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.497072219848633, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8417094945907593, "num_tokens": 105665796.0, "step": 2767 }, { "epoch": 0.352118051138532, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.18859100341797, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8426458239555359, "num_tokens": 105708892.0, "step": 2768 }, { "epoch": 0.3522452614171225, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.237178802490234, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8440301418304443, "num_tokens": 105748221.0, "step": 2769 }, { "epoch": 0.352372471695713, "ewc_loss": 0.045654296875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 23.918758392333984, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8478804230690002, "num_tokens": 105784568.0, "step": 2770 }, { "epoch": 0.35249968197430354, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.359819412231445, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8387899398803711, "num_tokens": 105827223.0, "step": 2771 }, { "epoch": 0.352626892252894, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 23.898601531982422, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8671512603759766, "num_tokens": 105871014.0, "step": 2772 }, { "epoch": 0.35275410253148454, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.375715255737305, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8454664349555969, "num_tokens": 105908995.0, "step": 2773 }, { "epoch": 0.35288131281007507, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.084747314453125, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8440757989883423, "num_tokens": 105948243.0, "step": 2774 }, { "epoch": 0.35300852308866554, "ewc_loss": 0.04638671875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.22991180419922, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8513068556785583, "num_tokens": 105983997.0, "step": 2775 }, { "epoch": 0.35313573336725607, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.135395050048828, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8362845182418823, "num_tokens": 106020336.0, "step": 2776 }, { "epoch": 0.3532629436458466, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.24565887451172, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8516180515289307, "num_tokens": 106053300.0, "step": 2777 }, { "epoch": 0.35339015392443707, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.414888381958008, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8629553914070129, "num_tokens": 106091790.0, "step": 2778 }, { "epoch": 0.3535173642030276, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.30031394958496, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8539465665817261, "num_tokens": 106123768.0, "step": 2779 }, { "epoch": 0.3536445744816181, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.12774658203125, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8571460247039795, "num_tokens": 106158219.0, "step": 2780 }, { "epoch": 0.35377178476020865, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.554563522338867, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8446013927459717, "num_tokens": 106195795.0, "step": 2781 }, { "epoch": 0.3538989950387991, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.348665237426758, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8507578372955322, "num_tokens": 106234973.0, "step": 2782 }, { "epoch": 0.35402620531738965, "ewc_loss": 0.04638671875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.423460006713867, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8358755111694336, "num_tokens": 106267297.0, "step": 2783 }, { "epoch": 0.3541534155959802, "ewc_loss": 0.0458984375, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.132678985595703, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.856743574142456, "num_tokens": 106309070.0, "step": 2784 }, { "epoch": 0.35428062587457065, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.450336456298828, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8531970381736755, "num_tokens": 106345784.0, "step": 2785 }, { "epoch": 0.3544078361531612, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.384767532348633, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.856561005115509, "num_tokens": 106383172.0, "step": 2786 }, { "epoch": 0.3545350464317517, "ewc_loss": 0.04638671875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.195091247558594, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.84873366355896, "num_tokens": 106416916.0, "step": 2787 }, { "epoch": 0.3546622567103422, "ewc_loss": 0.04638671875, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.388134002685547, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8343824148178101, "num_tokens": 106456969.0, "step": 2788 }, { "epoch": 0.3547894669889327, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.630117416381836, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8468710780143738, "num_tokens": 106490476.0, "step": 2789 }, { "epoch": 0.35491667726752324, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 23.862369537353516, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8496081829071045, "num_tokens": 106525990.0, "step": 2790 }, { "epoch": 0.3550438875461137, "ewc_loss": 0.046142578125, "ewc_loss_diag": 9.953975677490234e-06, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.623329162597656, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8241497278213501, "num_tokens": 106562953.0, "step": 2791 }, { "epoch": 0.35517109782470424, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.194599151611328, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8511184453964233, "num_tokens": 106599142.0, "step": 2792 }, { "epoch": 0.35529830810329477, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.45071029663086, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8616100549697876, "num_tokens": 106636743.0, "step": 2793 }, { "epoch": 0.35542551838188524, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.137168884277344, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8533854484558105, "num_tokens": 106678992.0, "step": 2794 }, { "epoch": 0.35555272866047577, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.2430477142334, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8412476778030396, "num_tokens": 106719884.0, "step": 2795 }, { "epoch": 0.3556799389390663, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.243812561035156, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8461350798606873, "num_tokens": 106755224.0, "step": 2796 }, { "epoch": 0.35580714921765677, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.23890495300293, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.860709547996521, "num_tokens": 106789366.0, "step": 2797 }, { "epoch": 0.3559343594962473, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.155973434448242, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8405017256736755, "num_tokens": 106826382.0, "step": 2798 }, { "epoch": 0.3560615697748378, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.541105270385742, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8397874236106873, "num_tokens": 106860027.0, "step": 2799 }, { "epoch": 0.3561887800534283, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 25.290485382080078, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8575282096862793, "num_tokens": 106898684.0, "step": 2800 }, { "epoch": 0.35631599033201883, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.342002868652344, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8496583700180054, "num_tokens": 106934496.0, "step": 2801 }, { "epoch": 0.35644320061060936, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 26.332351684570312, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8639086484909058, "num_tokens": 106974335.0, "step": 2802 }, { "epoch": 0.35657041088919983, "ewc_loss": 0.0458984375, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.552436828613281e-05, "grad_norm": 24.502140045166016, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.823229193687439, "num_tokens": 107013914.0, "step": 2803 }, { "epoch": 0.35669762116779036, "ewc_loss": 0.0458984375, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.410985946655273, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8549531698226929, "num_tokens": 107053747.0, "step": 2804 }, { "epoch": 0.3568248314463809, "ewc_loss": 0.0458984375, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.873098373413086, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8388416767120361, "num_tokens": 107095245.0, "step": 2805 }, { "epoch": 0.35695204172497136, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.929443359375, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8504164218902588, "num_tokens": 107133627.0, "step": 2806 }, { "epoch": 0.3570792520035619, "ewc_loss": 0.0458984375, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.5762786865234375e-05, "grad_norm": 24.13507843017578, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8261724710464478, "num_tokens": 107170128.0, "step": 2807 }, { "epoch": 0.3572064622821524, "ewc_loss": 0.0458984375, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.209516525268555, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8609427213668823, "num_tokens": 107209137.0, "step": 2808 }, { "epoch": 0.3573336725607429, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.193662643432617, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8452653884887695, "num_tokens": 107244866.0, "step": 2809 }, { "epoch": 0.3574608828393334, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.46392250061035, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.848932147026062, "num_tokens": 107287399.0, "step": 2810 }, { "epoch": 0.35758809311792394, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.49786376953125, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8407065868377686, "num_tokens": 107325627.0, "step": 2811 }, { "epoch": 0.3577153033965144, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.30083656311035, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8653378486633301, "num_tokens": 107359375.0, "step": 2812 }, { "epoch": 0.35784251367510495, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.335111618041992, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8380420804023743, "num_tokens": 107400286.0, "step": 2813 }, { "epoch": 0.3579697239536955, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.126981735229492, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8400087356567383, "num_tokens": 107431768.0, "step": 2814 }, { "epoch": 0.35809693423228595, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.37588882446289, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8592177033424377, "num_tokens": 107470875.0, "step": 2815 }, { "epoch": 0.3582241445108765, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.195138931274414, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8407443761825562, "num_tokens": 107510988.0, "step": 2816 }, { "epoch": 0.358351354789467, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.473037719726562, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8576123714447021, "num_tokens": 107552423.0, "step": 2817 }, { "epoch": 0.3584785650680575, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.25958251953125, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8412092328071594, "num_tokens": 107591693.0, "step": 2818 }, { "epoch": 0.358605775346648, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.590682983398438, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8537729382514954, "num_tokens": 107626017.0, "step": 2819 }, { "epoch": 0.35873298562523853, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.463665008544922, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8660842180252075, "num_tokens": 107664927.0, "step": 2820 }, { "epoch": 0.358860195903829, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.587421417236328, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8419989943504333, "num_tokens": 107702487.0, "step": 2821 }, { "epoch": 0.35898740618241953, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.518396377563477, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8625109195709229, "num_tokens": 107741263.0, "step": 2822 }, { "epoch": 0.35911461646101006, "ewc_loss": 0.0458984375, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.434232711791992, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8522648811340332, "num_tokens": 107781079.0, "step": 2823 }, { "epoch": 0.35924182673960053, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.32513999938965, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8689125776290894, "num_tokens": 107819309.0, "step": 2824 }, { "epoch": 0.35936903701819106, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.420530319213867, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8516467213630676, "num_tokens": 107853251.0, "step": 2825 }, { "epoch": 0.3594962472967816, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.509679794311523, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8691133856773376, "num_tokens": 107893476.0, "step": 2826 }, { "epoch": 0.35962345757537206, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.41588020324707, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.85907381772995, "num_tokens": 107931204.0, "step": 2827 }, { "epoch": 0.3597506678539626, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0132789611816406e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.614425659179688, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8313772678375244, "num_tokens": 107966471.0, "step": 2828 }, { "epoch": 0.3598778781325531, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.455610275268555, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8513780832290649, "num_tokens": 108005328.0, "step": 2829 }, { "epoch": 0.36000508841114365, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.68305778503418, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8395183086395264, "num_tokens": 108046295.0, "step": 2830 }, { "epoch": 0.3601322986897341, "ewc_loss": 0.04638671875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.62396240234375e-05, "grad_norm": 24.112550735473633, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8515864610671997, "num_tokens": 108078164.0, "step": 2831 }, { "epoch": 0.36025950896832465, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.85143280029297, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8452521562576294, "num_tokens": 108117105.0, "step": 2832 }, { "epoch": 0.3603867192469152, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.34170913696289, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8463999032974243, "num_tokens": 108155301.0, "step": 2833 }, { "epoch": 0.36051392952550565, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.910259246826172, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8597638607025146, "num_tokens": 108193091.0, "step": 2834 }, { "epoch": 0.3606411398040962, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.27521324157715, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8470876216888428, "num_tokens": 108233719.0, "step": 2835 }, { "epoch": 0.3607683500826867, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.581796646118164, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8564473390579224, "num_tokens": 108265775.0, "step": 2836 }, { "epoch": 0.3608955603612772, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.375547409057617, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8599246740341187, "num_tokens": 108307953.0, "step": 2837 }, { "epoch": 0.3610227706398677, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 25.162216186523438, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8464884757995605, "num_tokens": 108351998.0, "step": 2838 }, { "epoch": 0.36114998091845824, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.380332946777344, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8372892141342163, "num_tokens": 108395968.0, "step": 2839 }, { "epoch": 0.3612771911970487, "ewc_loss": 0.047119140625, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 25.183382034301758, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8319851756095886, "num_tokens": 108433045.0, "step": 2840 }, { "epoch": 0.36140440147563924, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.40826988220215, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8446208238601685, "num_tokens": 108470964.0, "step": 2841 }, { "epoch": 0.36153161175422976, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 25.325029373168945, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8506592512130737, "num_tokens": 108512572.0, "step": 2842 }, { "epoch": 0.36165882203282024, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.408926010131836, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8359365463256836, "num_tokens": 108548526.0, "step": 2843 }, { "epoch": 0.36178603231141077, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 25.172924041748047, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8478986024856567, "num_tokens": 108588447.0, "step": 2844 }, { "epoch": 0.3619132425900013, "ewc_loss": 0.046142578125, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.600120544433594e-05, "grad_norm": 24.608381271362305, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8396687507629395, "num_tokens": 108628426.0, "step": 2845 }, { "epoch": 0.36204045286859177, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.830888748168945, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8439721465110779, "num_tokens": 108661355.0, "step": 2846 }, { "epoch": 0.3621676631471823, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.913436889648438, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8656928539276123, "num_tokens": 108698505.0, "step": 2847 }, { "epoch": 0.3622948734257728, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.795446395874023, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8545113801956177, "num_tokens": 108737409.0, "step": 2848 }, { "epoch": 0.3624220837043633, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.658187866210938, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8508040904998779, "num_tokens": 108772969.0, "step": 2849 }, { "epoch": 0.3625492939829538, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 25.096908569335938, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8561734557151794, "num_tokens": 108805129.0, "step": 2850 }, { "epoch": 0.36267650426154435, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.433931350708008, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8348557949066162, "num_tokens": 108841197.0, "step": 2851 }, { "epoch": 0.3628037145401348, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 25.080322265625, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8630716800689697, "num_tokens": 108876442.0, "step": 2852 }, { "epoch": 0.36293092481872535, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.576025009155273, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8352075815200806, "num_tokens": 108911432.0, "step": 2853 }, { "epoch": 0.3630581350973159, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.621904373168945, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8335607051849365, "num_tokens": 108952977.0, "step": 2854 }, { "epoch": 0.36318534537590635, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.72784423828125, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8251925706863403, "num_tokens": 108995602.0, "step": 2855 }, { "epoch": 0.3633125556544969, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.46996307373047, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8520334959030151, "num_tokens": 109040226.0, "step": 2856 }, { "epoch": 0.3634397659330874, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.455944061279297, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8524262309074402, "num_tokens": 109082898.0, "step": 2857 }, { "epoch": 0.3635669762116779, "ewc_loss": 0.046630859375, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.647804260253906e-05, "grad_norm": 24.650848388671875, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8404318690299988, "num_tokens": 109122460.0, "step": 2858 }, { "epoch": 0.3636941864902684, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.539608001708984, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8489692211151123, "num_tokens": 109161680.0, "step": 2859 }, { "epoch": 0.36382139676885894, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.640043258666992, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8679109811782837, "num_tokens": 109198943.0, "step": 2860 }, { "epoch": 0.3639486070474494, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.774091720581055, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8578367829322815, "num_tokens": 109235719.0, "step": 2861 }, { "epoch": 0.36407581732603994, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.52785301208496, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.841681718826294, "num_tokens": 109272681.0, "step": 2862 }, { "epoch": 0.36420302760463047, "ewc_loss": 0.046875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.70339584350586, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8380765914916992, "num_tokens": 109306114.0, "step": 2863 }, { "epoch": 0.36433023788322094, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.52066993713379, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8409404754638672, "num_tokens": 109337728.0, "step": 2864 }, { "epoch": 0.36445744816181147, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.57193946838379, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8399155139923096, "num_tokens": 109377809.0, "step": 2865 }, { "epoch": 0.364584658440402, "ewc_loss": 0.047119140625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.6716461181640625e-05, "grad_norm": 24.625085830688477, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8635098338127136, "num_tokens": 109413499.0, "step": 2866 }, { "epoch": 0.36471186871899247, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.541767120361328, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8400552868843079, "num_tokens": 109460961.0, "step": 2867 }, { "epoch": 0.364839078997583, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.86876106262207, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8606402277946472, "num_tokens": 109500615.0, "step": 2868 }, { "epoch": 0.36496628927617353, "ewc_loss": 0.047119140625, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.473058700561523, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8515158891677856, "num_tokens": 109533921.0, "step": 2869 }, { "epoch": 0.365093499554764, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.747610092163086, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8392884731292725, "num_tokens": 109567069.0, "step": 2870 }, { "epoch": 0.36522070983335453, "ewc_loss": 0.047119140625, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.31239128112793, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8627191781997681, "num_tokens": 109603696.0, "step": 2871 }, { "epoch": 0.36534792011194506, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.973432540893555, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8667036890983582, "num_tokens": 109644590.0, "step": 2872 }, { "epoch": 0.36547513039053553, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.695487976074219e-05, "grad_norm": 24.344179153442383, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8450056314468384, "num_tokens": 109678869.0, "step": 2873 }, { "epoch": 0.36560234066912606, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.72052574157715, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8434464931488037, "num_tokens": 109722517.0, "step": 2874 }, { "epoch": 0.3657295509477166, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.49515151977539, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8461968302726746, "num_tokens": 109757869.0, "step": 2875 }, { "epoch": 0.36585676122630706, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.72321891784668, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8444846272468567, "num_tokens": 109793162.0, "step": 2876 }, { "epoch": 0.3659839715048976, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.35247039794922, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8558177947998047, "num_tokens": 109832942.0, "step": 2877 }, { "epoch": 0.3661111817834881, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.697185516357422, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8470743894577026, "num_tokens": 109876230.0, "step": 2878 }, { "epoch": 0.3662383920620786, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.46011734008789, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8395049571990967, "num_tokens": 109911844.0, "step": 2879 }, { "epoch": 0.3663656023406691, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 25.03847312927246, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8368642330169678, "num_tokens": 109957619.0, "step": 2880 }, { "epoch": 0.36649281261925964, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.34246826171875, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8386934995651245, "num_tokens": 109999117.0, "step": 2881 }, { "epoch": 0.3666200228978502, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.9619197845459, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.856553316116333, "num_tokens": 110039557.0, "step": 2882 }, { "epoch": 0.36674723317644065, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.616710662841797, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8406165838241577, "num_tokens": 110074418.0, "step": 2883 }, { "epoch": 0.3668744434550312, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.87801170349121, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8550394773483276, "num_tokens": 110109255.0, "step": 2884 }, { "epoch": 0.3670016537336217, "ewc_loss": 0.04736328125, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.655426025390625, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8545796871185303, "num_tokens": 110150180.0, "step": 2885 }, { "epoch": 0.3671288640122122, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.893341064453125, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8325813412666321, "num_tokens": 110193653.0, "step": 2886 }, { "epoch": 0.3672560742908027, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.83012580871582, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8565622568130493, "num_tokens": 110231020.0, "step": 2887 }, { "epoch": 0.36738328456939323, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.656728744506836, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8652857542037964, "num_tokens": 110269397.0, "step": 2888 }, { "epoch": 0.3675104948479837, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.589611053466797, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8644313812255859, "num_tokens": 110311188.0, "step": 2889 }, { "epoch": 0.36763770512657423, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.609342575073242, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8537948131561279, "num_tokens": 110343801.0, "step": 2890 }, { "epoch": 0.36776491540516476, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.74026870727539, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8328211307525635, "num_tokens": 110385830.0, "step": 2891 }, { "epoch": 0.36789212568375523, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.881431579589844, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8481484651565552, "num_tokens": 110421494.0, "step": 2892 }, { "epoch": 0.36801933596234576, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.580463409423828, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8484309911727905, "num_tokens": 110463132.0, "step": 2893 }, { "epoch": 0.3681465462409363, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.40436553955078, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.853090226650238, "num_tokens": 110498330.0, "step": 2894 }, { "epoch": 0.36827375651952676, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.549861907958984, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8588290214538574, "num_tokens": 110534438.0, "step": 2895 }, { "epoch": 0.3684009667981173, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.705421447753906, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8454605937004089, "num_tokens": 110572059.0, "step": 2896 }, { "epoch": 0.3685281770767078, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.394306182861328, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8509957790374756, "num_tokens": 110611012.0, "step": 2897 }, { "epoch": 0.3686553873552983, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 25.052833557128906, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8576347231864929, "num_tokens": 110647429.0, "step": 2898 }, { "epoch": 0.3687825976338888, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.46564483642578, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.842872142791748, "num_tokens": 110685052.0, "step": 2899 }, { "epoch": 0.36890980791247935, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.932613372802734, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8284716010093689, "num_tokens": 110725222.0, "step": 2900 }, { "epoch": 0.3690370181910698, "ewc_loss": 0.047607421875, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.719329833984375e-05, "grad_norm": 24.57630157470703, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8385376334190369, "num_tokens": 110764314.0, "step": 2901 }, { "epoch": 0.36916422846966035, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.650022506713867, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.861929178237915, "num_tokens": 110803585.0, "step": 2902 }, { "epoch": 0.3692914387482509, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.561063766479492, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8439103960990906, "num_tokens": 110846532.0, "step": 2903 }, { "epoch": 0.36941864902684135, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.530729293823242, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8466819524765015, "num_tokens": 110889807.0, "step": 2904 }, { "epoch": 0.3695458593054319, "ewc_loss": 0.0478515625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.424169540405273, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8303804397583008, "num_tokens": 110930567.0, "step": 2905 }, { "epoch": 0.3696730695840224, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.349382400512695, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8558729887008667, "num_tokens": 110968725.0, "step": 2906 }, { "epoch": 0.3698002798626129, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.650203704833984, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8738978505134583, "num_tokens": 111002724.0, "step": 2907 }, { "epoch": 0.3699274901412034, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.566774368286133, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8478126525878906, "num_tokens": 111036668.0, "step": 2908 }, { "epoch": 0.37005470041979394, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.24907112121582, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8403805494308472, "num_tokens": 111077712.0, "step": 2909 }, { "epoch": 0.3701819106983844, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.720224380493164, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.840173602104187, "num_tokens": 111118856.0, "step": 2910 }, { "epoch": 0.37030912097697494, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.401165008544922, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8408728837966919, "num_tokens": 111156712.0, "step": 2911 }, { "epoch": 0.37043633125556547, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.469255447387695, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8504654169082642, "num_tokens": 111195542.0, "step": 2912 }, { "epoch": 0.37056354153415594, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.6195125579834, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8316611051559448, "num_tokens": 111237827.0, "step": 2913 }, { "epoch": 0.37069075181274647, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.44534683227539, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8574701547622681, "num_tokens": 111278802.0, "step": 2914 }, { "epoch": 0.370817962091337, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.553180694580078, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8473127484321594, "num_tokens": 111313795.0, "step": 2915 }, { "epoch": 0.37094517236992747, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.6991024017334, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8552700281143188, "num_tokens": 111353798.0, "step": 2916 }, { "epoch": 0.371072382648518, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.742403030395508, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8488200902938843, "num_tokens": 111390613.0, "step": 2917 }, { "epoch": 0.3711995929271085, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.591379165649414, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8496063947677612, "num_tokens": 111432043.0, "step": 2918 }, { "epoch": 0.371326803205699, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.638389587402344, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8559246063232422, "num_tokens": 111476575.0, "step": 2919 }, { "epoch": 0.3714540134842895, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.42267417907715, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8484965562820435, "num_tokens": 111515304.0, "step": 2920 }, { "epoch": 0.37158122376288005, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.7275390625, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8589329719543457, "num_tokens": 111550246.0, "step": 2921 }, { "epoch": 0.3717084340414705, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.94495964050293, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8503441214561462, "num_tokens": 111592189.0, "step": 2922 }, { "epoch": 0.37183564432006105, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.674671173095703, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.852614164352417, "num_tokens": 111638177.0, "step": 2923 }, { "epoch": 0.3719628545986516, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.533649444580078, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8516282439231873, "num_tokens": 111674365.0, "step": 2924 }, { "epoch": 0.37209006487724205, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.4601993560791, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8453218936920166, "num_tokens": 111714571.0, "step": 2925 }, { "epoch": 0.3722172751558326, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.943744659423828, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8498815298080444, "num_tokens": 111755214.0, "step": 2926 }, { "epoch": 0.3723444854344231, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.447782516479492, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8223127126693726, "num_tokens": 111796516.0, "step": 2927 }, { "epoch": 0.3724716957130136, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.895389556884766, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8355118036270142, "num_tokens": 111832615.0, "step": 2928 }, { "epoch": 0.3725989059916041, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.572702407836914, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8393644094467163, "num_tokens": 111874682.0, "step": 2929 }, { "epoch": 0.37272611627019464, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.98069190979004, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8363966941833496, "num_tokens": 111916369.0, "step": 2930 }, { "epoch": 0.37285332654878517, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.043081283569336e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.51979637145996, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8405969142913818, "num_tokens": 111950671.0, "step": 2931 }, { "epoch": 0.37298053682737564, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.88066291809082, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8324222564697266, "num_tokens": 111988846.0, "step": 2932 }, { "epoch": 0.37310774710596617, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.64951515197754, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.857166588306427, "num_tokens": 112022853.0, "step": 2933 }, { "epoch": 0.3732349573845567, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.8984375, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8450280427932739, "num_tokens": 112065631.0, "step": 2934 }, { "epoch": 0.37336216766314717, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.94412612915039, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8387091159820557, "num_tokens": 112107236.0, "step": 2935 }, { "epoch": 0.3734893779417377, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.002944946289062, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8268555402755737, "num_tokens": 112153914.0, "step": 2936 }, { "epoch": 0.3736165882203282, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.770227432250977, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8509032726287842, "num_tokens": 112187596.0, "step": 2937 }, { "epoch": 0.3737437984989187, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.97001838684082, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8445529937744141, "num_tokens": 112222409.0, "step": 2938 }, { "epoch": 0.37387100877750923, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.6180419921875, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8370116949081421, "num_tokens": 112257699.0, "step": 2939 }, { "epoch": 0.37399821905609976, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.98914909362793, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.857656717300415, "num_tokens": 112292431.0, "step": 2940 }, { "epoch": 0.37412542933469023, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.82156753540039, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8507001399993896, "num_tokens": 112331241.0, "step": 2941 }, { "epoch": 0.37425263961328076, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.679914474487305, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8450039625167847, "num_tokens": 112368486.0, "step": 2942 }, { "epoch": 0.3743798498918713, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.75969886779785, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8397170305252075, "num_tokens": 112409638.0, "step": 2943 }, { "epoch": 0.37450706017046176, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.874338150024414, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8440905809402466, "num_tokens": 112448257.0, "step": 2944 }, { "epoch": 0.3746342704490523, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.566570281982422, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8637889623641968, "num_tokens": 112488739.0, "step": 2945 }, { "epoch": 0.3747614807276428, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.994199752807617, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.834202229976654, "num_tokens": 112528621.0, "step": 2946 }, { "epoch": 0.3748886910062333, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.667871475219727, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8427589535713196, "num_tokens": 112564615.0, "step": 2947 }, { "epoch": 0.3750159012848238, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.457441329956055, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8373953700065613, "num_tokens": 112598190.0, "step": 2948 }, { "epoch": 0.37514311156341434, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.785188674926758, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8478977084159851, "num_tokens": 112632078.0, "step": 2949 }, { "epoch": 0.3752703218420048, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.09128189086914, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8256157636642456, "num_tokens": 112667848.0, "step": 2950 }, { "epoch": 0.37539753212059535, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.0158748626709, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8466482162475586, "num_tokens": 112707037.0, "step": 2951 }, { "epoch": 0.3755247423991859, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.121850967407227, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8576076030731201, "num_tokens": 112751921.0, "step": 2952 }, { "epoch": 0.37565195267777635, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.92729377746582, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8521661758422852, "num_tokens": 112793061.0, "step": 2953 }, { "epoch": 0.3757791629563669, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.12393569946289, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8580699563026428, "num_tokens": 112833852.0, "step": 2954 }, { "epoch": 0.3759063732349574, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.631620407104492, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8439721465110779, "num_tokens": 112866718.0, "step": 2955 }, { "epoch": 0.3760335835135479, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.422399520874023, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8499778509140015, "num_tokens": 112909420.0, "step": 2956 }, { "epoch": 0.3761607937921384, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.701810836791992, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8504266142845154, "num_tokens": 112944570.0, "step": 2957 }, { "epoch": 0.37628800407072893, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.280479431152344, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.862132728099823, "num_tokens": 112984107.0, "step": 2958 }, { "epoch": 0.3764152143493194, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.635896682739258, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8708407282829285, "num_tokens": 113023066.0, "step": 2959 }, { "epoch": 0.37654242462790993, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.982967376708984, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8457542657852173, "num_tokens": 113060093.0, "step": 2960 }, { "epoch": 0.37666963490650046, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.981767654418945, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8414140343666077, "num_tokens": 113095940.0, "step": 2961 }, { "epoch": 0.37679684518509093, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.965679168701172, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8593250513076782, "num_tokens": 113130699.0, "step": 2962 }, { "epoch": 0.37692405546368146, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.156164169311523, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8485575914382935, "num_tokens": 113169145.0, "step": 2963 }, { "epoch": 0.377051265742272, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.87681007385254, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.863240659236908, "num_tokens": 113201987.0, "step": 2964 }, { "epoch": 0.37717847602086246, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.23971176147461, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8546784520149231, "num_tokens": 113239588.0, "step": 2965 }, { "epoch": 0.377305686299453, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.060619354248047, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8502136468887329, "num_tokens": 113277075.0, "step": 2966 }, { "epoch": 0.3774328965780435, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.046300888061523, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8769931793212891, "num_tokens": 113313708.0, "step": 2967 }, { "epoch": 0.377560106856634, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.101158142089844, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8293412923812866, "num_tokens": 113344027.0, "step": 2968 }, { "epoch": 0.3776873171352245, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.22756576538086, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8540833592414856, "num_tokens": 113388444.0, "step": 2969 }, { "epoch": 0.37781452741381505, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.842838287353516, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8445524573326111, "num_tokens": 113428660.0, "step": 2970 }, { "epoch": 0.3779417376924055, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.484128952026367, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8276544809341431, "num_tokens": 113469160.0, "step": 2971 }, { "epoch": 0.37806894797099605, "ewc_loss": 0.048095703125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.743171691894531e-05, "grad_norm": 24.629194259643555, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8240707516670227, "num_tokens": 113504291.0, "step": 2972 }, { "epoch": 0.3781961582495866, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.159866333007812, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8509662747383118, "num_tokens": 113548085.0, "step": 2973 }, { "epoch": 0.37832336852817705, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.49355125427246, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8530517220497131, "num_tokens": 113581664.0, "step": 2974 }, { "epoch": 0.3784505788067676, "ewc_loss": 0.048583984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 25.23550033569336, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8622857332229614, "num_tokens": 113618552.0, "step": 2975 }, { "epoch": 0.3785777890853581, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.7670135498046875e-05, "grad_norm": 24.739511489868164, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8282978534698486, "num_tokens": 113653406.0, "step": 2976 }, { "epoch": 0.3787049993639486, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.956327438354492, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8575709462165833, "num_tokens": 113693803.0, "step": 2977 }, { "epoch": 0.3788322096425391, "ewc_loss": 0.04833984375, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.790855407714844e-05, "grad_norm": 24.476699829101562, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8639926910400391, "num_tokens": 113734637.0, "step": 2978 }, { "epoch": 0.37895941992112964, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.002344131469727, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8421220779418945, "num_tokens": 113776579.0, "step": 2979 }, { "epoch": 0.37908663019972016, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.797344207763672, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8388401865959167, "num_tokens": 113818857.0, "step": 2980 }, { "epoch": 0.37921384047831064, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.923982620239258, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8404229879379272, "num_tokens": 113853248.0, "step": 2981 }, { "epoch": 0.37934105075690117, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.67787742614746, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8555029034614563, "num_tokens": 113894206.0, "step": 2982 }, { "epoch": 0.3794682610354917, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.99818229675293, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8563388586044312, "num_tokens": 113931309.0, "step": 2983 }, { "epoch": 0.37959547131408217, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.7015323638916, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8393261432647705, "num_tokens": 113976559.0, "step": 2984 }, { "epoch": 0.3797226815926727, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.823881149291992, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8478261232376099, "num_tokens": 114010814.0, "step": 2985 }, { "epoch": 0.3798498918712632, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.88157844543457, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8681628704071045, "num_tokens": 114049389.0, "step": 2986 }, { "epoch": 0.3799771021498537, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.913394927978516, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8669329285621643, "num_tokens": 114081463.0, "step": 2987 }, { "epoch": 0.3801043124284442, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.915422439575195, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8645107746124268, "num_tokens": 114117175.0, "step": 2988 }, { "epoch": 0.38023152270703475, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.960426330566406, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8385921716690063, "num_tokens": 114153764.0, "step": 2989 }, { "epoch": 0.3803587329856252, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.990718841552734, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8488445281982422, "num_tokens": 114191487.0, "step": 2990 }, { "epoch": 0.38048594326421575, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.064390182495117, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8248895406723022, "num_tokens": 114233880.0, "step": 2991 }, { "epoch": 0.3806131535428063, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.218446731567383, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8478706479072571, "num_tokens": 114269238.0, "step": 2992 }, { "epoch": 0.38074036382139675, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.82395362854004, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8612021803855896, "num_tokens": 114307645.0, "step": 2993 }, { "epoch": 0.3808675740999873, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 25.07398796081543, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.853415310382843, "num_tokens": 114341800.0, "step": 2994 }, { "epoch": 0.3809947843785778, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.89641571044922, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8576840162277222, "num_tokens": 114378363.0, "step": 2995 }, { "epoch": 0.3811219946571683, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 25.06424903869629, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8385747075080872, "num_tokens": 114414489.0, "step": 2996 }, { "epoch": 0.3812492049357588, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.034090042114258, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8439062833786011, "num_tokens": 114449197.0, "step": 2997 }, { "epoch": 0.38137641521434934, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.217979431152344, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8474299311637878, "num_tokens": 114478654.0, "step": 2998 }, { "epoch": 0.3815036254929398, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 25.052522659301758, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8581470251083374, "num_tokens": 114521807.0, "step": 2999 }, { "epoch": 0.38163083577153034, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.258344650268555, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8629826903343201, "num_tokens": 114558297.0, "step": 3000 }, { "epoch": 0.38175804605012087, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.802051544189453, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8551739454269409, "num_tokens": 114599630.0, "step": 3001 }, { "epoch": 0.38188525632871134, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.205657958984375, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8478278517723083, "num_tokens": 114638697.0, "step": 3002 }, { "epoch": 0.38201246660730187, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.580965042114258, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.850102424621582, "num_tokens": 114676106.0, "step": 3003 }, { "epoch": 0.3821396768858924, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.329448699951172, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8470536470413208, "num_tokens": 114712026.0, "step": 3004 }, { "epoch": 0.38226688716448287, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.758716583251953, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8270428776741028, "num_tokens": 114752348.0, "step": 3005 }, { "epoch": 0.3823940974430734, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.42874526977539, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8540983200073242, "num_tokens": 114789922.0, "step": 3006 }, { "epoch": 0.38252130772166393, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.53899383544922, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8440070748329163, "num_tokens": 114838834.0, "step": 3007 }, { "epoch": 0.3826485180002544, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.367603302001953, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8276085257530212, "num_tokens": 114873320.0, "step": 3008 }, { "epoch": 0.38277572827884493, "ewc_loss": 0.048828125, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.814697265625e-05, "grad_norm": 24.79815673828125, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8610349893569946, "num_tokens": 114908293.0, "step": 3009 }, { "epoch": 0.38290293855743546, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 24.842832565307617, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.839653491973877, "num_tokens": 114950083.0, "step": 3010 }, { "epoch": 0.38303014883602593, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.05495262145996, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8520504236221313, "num_tokens": 114982353.0, "step": 3011 }, { "epoch": 0.38315735911461646, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.6469669342041, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8542916774749756, "num_tokens": 115017668.0, "step": 3012 }, { "epoch": 0.383284569393207, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.001941680908203, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8602116107940674, "num_tokens": 115055266.0, "step": 3013 }, { "epoch": 0.38341177967179746, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 24.63857650756836, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8450028896331787, "num_tokens": 115089728.0, "step": 3014 }, { "epoch": 0.383538989950388, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.516511917114258, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8344101905822754, "num_tokens": 115135192.0, "step": 3015 }, { "epoch": 0.3836662002289785, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.644668579101562, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.831981897354126, "num_tokens": 115170350.0, "step": 3016 }, { "epoch": 0.383793410507569, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.16519546508789, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.857210099697113, "num_tokens": 115202724.0, "step": 3017 }, { "epoch": 0.3839206207861595, "ewc_loss": 0.049072265625, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 24.729148864746094, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8495001792907715, "num_tokens": 115239274.0, "step": 3018 }, { "epoch": 0.38404783106475004, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.03399085998535, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8577801585197449, "num_tokens": 115278330.0, "step": 3019 }, { "epoch": 0.3841750413433405, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.875141143798828, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.844912588596344, "num_tokens": 115321258.0, "step": 3020 }, { "epoch": 0.38430225162193105, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.770172119140625, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8507199287414551, "num_tokens": 115362609.0, "step": 3021 }, { "epoch": 0.3844294619005216, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.141254425048828, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.822352409362793, "num_tokens": 115405814.0, "step": 3022 }, { "epoch": 0.38455667217911205, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.866819381713867, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8436132669448853, "num_tokens": 115440856.0, "step": 3023 }, { "epoch": 0.3846838824577026, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.245317459106445, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8556609749794006, "num_tokens": 115482582.0, "step": 3024 }, { "epoch": 0.3848110927362931, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.570117950439453, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8396978378295898, "num_tokens": 115527268.0, "step": 3025 }, { "epoch": 0.3849383030148836, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.18916130065918, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8439691066741943, "num_tokens": 115563757.0, "step": 3026 }, { "epoch": 0.3850655132934741, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.830102920532227, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8526570200920105, "num_tokens": 115599629.0, "step": 3027 }, { "epoch": 0.38519272357206463, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.106910705566406, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.85237717628479, "num_tokens": 115634261.0, "step": 3028 }, { "epoch": 0.3853199338506551, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 24.998607635498047, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.861251175403595, "num_tokens": 115671150.0, "step": 3029 }, { "epoch": 0.38544714412924563, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.057371139526367, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8633561730384827, "num_tokens": 115714328.0, "step": 3030 }, { "epoch": 0.38557435440783616, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.077770233154297, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8648707270622253, "num_tokens": 115747118.0, "step": 3031 }, { "epoch": 0.3857015646864267, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.908023834228516, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.829646110534668, "num_tokens": 115790309.0, "step": 3032 }, { "epoch": 0.38582877496501716, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.227615356445312, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8619996309280396, "num_tokens": 115825630.0, "step": 3033 }, { "epoch": 0.3859559852436077, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.107654571533203, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8558768630027771, "num_tokens": 115858848.0, "step": 3034 }, { "epoch": 0.3860831955221982, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.10840606689453, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8581426739692688, "num_tokens": 115900024.0, "step": 3035 }, { "epoch": 0.3862104058007887, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.91083526611328, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.825192928314209, "num_tokens": 115938899.0, "step": 3036 }, { "epoch": 0.3863376160793792, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 25.19157600402832, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8565003871917725, "num_tokens": 115972199.0, "step": 3037 }, { "epoch": 0.38646482635796975, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.93512535095215, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8585577011108398, "num_tokens": 116011506.0, "step": 3038 }, { "epoch": 0.3865920366365602, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.838539123535156e-05, "grad_norm": 25.00586700439453, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8274983763694763, "num_tokens": 116051428.0, "step": 3039 }, { "epoch": 0.38671924691515075, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.937402725219727, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8583468198776245, "num_tokens": 116090307.0, "step": 3040 }, { "epoch": 0.3868464571937413, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.253475189208984, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8395111560821533, "num_tokens": 116123381.0, "step": 3041 }, { "epoch": 0.38697366747233175, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.733793258666992, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8524035215377808, "num_tokens": 116157755.0, "step": 3042 }, { "epoch": 0.3871008777509223, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 24.946455001831055, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8502392768859863, "num_tokens": 116197371.0, "step": 3043 }, { "epoch": 0.3872280880295128, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.243072509765625, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8351840972900391, "num_tokens": 116240544.0, "step": 3044 }, { "epoch": 0.3873552983081033, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.19675064086914, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8458636403083801, "num_tokens": 116272079.0, "step": 3045 }, { "epoch": 0.3874825085866938, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.888660430908203, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8467411994934082, "num_tokens": 116309641.0, "step": 3046 }, { "epoch": 0.38760971886528434, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.143131256103516, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8437985777854919, "num_tokens": 116344470.0, "step": 3047 }, { "epoch": 0.3877369291438748, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.181703567504883, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8527317047119141, "num_tokens": 116380177.0, "step": 3048 }, { "epoch": 0.38786413942246534, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.014516830444336, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8595889806747437, "num_tokens": 116416482.0, "step": 3049 }, { "epoch": 0.38799134970105587, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.1840877532959, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.837043285369873, "num_tokens": 116456103.0, "step": 3050 }, { "epoch": 0.38811855997964634, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.292421340942383, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8552801609039307, "num_tokens": 116496810.0, "step": 3051 }, { "epoch": 0.38824577025823687, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.03221893310547, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8462314009666443, "num_tokens": 116532999.0, "step": 3052 }, { "epoch": 0.3883729805368274, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.50702667236328, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8573300242424011, "num_tokens": 116565700.0, "step": 3053 }, { "epoch": 0.38850019081541787, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.753997802734375, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8543820381164551, "num_tokens": 116602798.0, "step": 3054 }, { "epoch": 0.3886274010940084, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.491119384765625, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.860914945602417, "num_tokens": 116638743.0, "step": 3055 }, { "epoch": 0.3887546113725989, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.106779098510742, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8612229824066162, "num_tokens": 116672290.0, "step": 3056 }, { "epoch": 0.3888818216511894, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.139169692993164, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8515332937240601, "num_tokens": 116712593.0, "step": 3057 }, { "epoch": 0.3890090319297799, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.26275634765625, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8480332493782043, "num_tokens": 116748663.0, "step": 3058 }, { "epoch": 0.38913624220837045, "ewc_loss": 0.04931640625, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 25.16060447692871, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8463384509086609, "num_tokens": 116796018.0, "step": 3059 }, { "epoch": 0.3892634524869609, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.147171020507812, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8465427756309509, "num_tokens": 116837287.0, "step": 3060 }, { "epoch": 0.38939066276555145, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.171215057373047, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8333714008331299, "num_tokens": 116872163.0, "step": 3061 }, { "epoch": 0.389517873044142, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.181564331054688, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8568294048309326, "num_tokens": 116913445.0, "step": 3062 }, { "epoch": 0.38964508332273246, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.13199806213379, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8409778475761414, "num_tokens": 116954469.0, "step": 3063 }, { "epoch": 0.389772293601323, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.147687911987305, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8487542867660522, "num_tokens": 116994339.0, "step": 3064 }, { "epoch": 0.3898995038799135, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.478885650634766, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8624111413955688, "num_tokens": 117030867.0, "step": 3065 }, { "epoch": 0.390026714158504, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 24.882707595825195, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.857504665851593, "num_tokens": 117068309.0, "step": 3066 }, { "epoch": 0.3901539244370945, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.354290008544922, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8454127907752991, "num_tokens": 117105278.0, "step": 3067 }, { "epoch": 0.39028113471568504, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.337038040161133, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8413002490997314, "num_tokens": 117143096.0, "step": 3068 }, { "epoch": 0.3904083449942755, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.332956314086914, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8514518141746521, "num_tokens": 117179638.0, "step": 3069 }, { "epoch": 0.39053555527286604, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.264694213867188, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.86397385597229, "num_tokens": 117211560.0, "step": 3070 }, { "epoch": 0.39066276555145657, "ewc_loss": 0.0498046875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.886222839355469e-05, "grad_norm": 25.06865119934082, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8425330519676208, "num_tokens": 117250622.0, "step": 3071 }, { "epoch": 0.39078997583004704, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 25.36547088623047, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8694322109222412, "num_tokens": 117287600.0, "step": 3072 }, { "epoch": 0.39091718610863757, "ewc_loss": 0.049560546875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.8623809814453125e-05, "grad_norm": 25.095932006835938, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8348597288131714, "num_tokens": 117328188.0, "step": 3073 }, { "epoch": 0.3910443963872281, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.593599319458008, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8577232360839844, "num_tokens": 117367282.0, "step": 3074 }, { "epoch": 0.39117160666581857, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.042461395263672, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.860450267791748, "num_tokens": 117403376.0, "step": 3075 }, { "epoch": 0.3912988169444091, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.38778305053711, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8388969302177429, "num_tokens": 117438661.0, "step": 3076 }, { "epoch": 0.39142602722299963, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.109394073486328, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.853752613067627, "num_tokens": 117477351.0, "step": 3077 }, { "epoch": 0.3915532375015901, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 24.936269760131836, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8473809361457825, "num_tokens": 117512773.0, "step": 3078 }, { "epoch": 0.39168044778018063, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.250225067138672, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8349941968917847, "num_tokens": 117555370.0, "step": 3079 }, { "epoch": 0.39180765805877116, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.155757904052734, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8577574491500854, "num_tokens": 117594208.0, "step": 3080 }, { "epoch": 0.3919348683373617, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.185089111328125, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8538349866867065, "num_tokens": 117632715.0, "step": 3081 }, { "epoch": 0.39206207861595216, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.38243293762207, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8462973833084106, "num_tokens": 117669221.0, "step": 3082 }, { "epoch": 0.3921892888945427, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.457561492919922, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.844458818435669, "num_tokens": 117709559.0, "step": 3083 }, { "epoch": 0.3923164991731332, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.235925674438477, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8576647043228149, "num_tokens": 117745240.0, "step": 3084 }, { "epoch": 0.3924437094517237, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.155345916748047, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8513027429580688, "num_tokens": 117787897.0, "step": 3085 }, { "epoch": 0.3925709197303142, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.232213973999023, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8468047380447388, "num_tokens": 117826862.0, "step": 3086 }, { "epoch": 0.39269813000890474, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.433406829833984, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.845514178276062, "num_tokens": 117866932.0, "step": 3087 }, { "epoch": 0.3928253402874952, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.418819427490234, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8516603708267212, "num_tokens": 117905538.0, "step": 3088 }, { "epoch": 0.39295255056608575, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.366554260253906, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8386568427085876, "num_tokens": 117945123.0, "step": 3089 }, { "epoch": 0.3930797608446763, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.172243118286133, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8522348403930664, "num_tokens": 117988743.0, "step": 3090 }, { "epoch": 0.39320697112326675, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.646760940551758, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8647075891494751, "num_tokens": 118026078.0, "step": 3091 }, { "epoch": 0.3933341814018573, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.118186950683594, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.861457347869873, "num_tokens": 118060105.0, "step": 3092 }, { "epoch": 0.3934613916804478, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.545583724975586, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8768547773361206, "num_tokens": 118089677.0, "step": 3093 }, { "epoch": 0.3935886019590383, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.39322280883789, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8632429242134094, "num_tokens": 118128449.0, "step": 3094 }, { "epoch": 0.3937158122376288, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.575162887573242, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8586301803588867, "num_tokens": 118168411.0, "step": 3095 }, { "epoch": 0.39384302251621933, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.36767578125, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8612510561943054, "num_tokens": 118206097.0, "step": 3096 }, { "epoch": 0.3939702327948098, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.528789520263672, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8636926412582397, "num_tokens": 118244429.0, "step": 3097 }, { "epoch": 0.39409744307340033, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.61211395263672, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8563603162765503, "num_tokens": 118279125.0, "step": 3098 }, { "epoch": 0.39422465335199086, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.4438419342041, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8469694256782532, "num_tokens": 118313156.0, "step": 3099 }, { "epoch": 0.39435186363058133, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.61185073852539, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8350003957748413, "num_tokens": 118349868.0, "step": 3100 }, { "epoch": 0.39447907390917186, "ewc_loss": 0.050048828125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.910064697265625e-05, "grad_norm": 25.313432693481445, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.840334951877594, "num_tokens": 118394998.0, "step": 3101 }, { "epoch": 0.3946062841877624, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.659181594848633, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8628383874893188, "num_tokens": 118436002.0, "step": 3102 }, { "epoch": 0.39473349446635286, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.31829071044922, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8557353019714355, "num_tokens": 118475094.0, "step": 3103 }, { "epoch": 0.3948607047449434, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.637048721313477, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8489894866943359, "num_tokens": 118514164.0, "step": 3104 }, { "epoch": 0.3949879150235339, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.23372459411621, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8459365367889404, "num_tokens": 118554271.0, "step": 3105 }, { "epoch": 0.3951151253021244, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.3610782623291, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8518170714378357, "num_tokens": 118597123.0, "step": 3106 }, { "epoch": 0.3952423355807149, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.53614616394043, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.844719648361206, "num_tokens": 118636683.0, "step": 3107 }, { "epoch": 0.39536954585930545, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.33087158203125, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8524749279022217, "num_tokens": 118665770.0, "step": 3108 }, { "epoch": 0.3954967561378959, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.40644645690918, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8606302738189697, "num_tokens": 118701652.0, "step": 3109 }, { "epoch": 0.39562396641648645, "ewc_loss": 0.05029296875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.933906555175781e-05, "grad_norm": 25.360492706298828, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8575469255447388, "num_tokens": 118741756.0, "step": 3110 }, { "epoch": 0.395751176695077, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.290220260620117, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8511159420013428, "num_tokens": 118784320.0, "step": 3111 }, { "epoch": 0.39587838697366745, "ewc_loss": 0.050537109375, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.9577484130859375e-05, "grad_norm": 25.133541107177734, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8440555334091187, "num_tokens": 118821456.0, "step": 3112 }, { "epoch": 0.396005597252258, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.543102264404297, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8489128351211548, "num_tokens": 118861746.0, "step": 3113 }, { "epoch": 0.3961328075308485, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.266677856445312, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.855156421661377, "num_tokens": 118900346.0, "step": 3114 }, { "epoch": 0.396260017809439, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.572223663330078, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8504339456558228, "num_tokens": 118939707.0, "step": 3115 }, { "epoch": 0.3963872280880295, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.117746353149414, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8507422804832458, "num_tokens": 118971154.0, "step": 3116 }, { "epoch": 0.39651443836662004, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.677413940429688, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8351431488990784, "num_tokens": 119012344.0, "step": 3117 }, { "epoch": 0.3966416486452105, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.342294692993164, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8499715924263, "num_tokens": 119047677.0, "step": 3118 }, { "epoch": 0.39676885892380104, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.38996124267578, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.86518394947052, "num_tokens": 119086357.0, "step": 3119 }, { "epoch": 0.39689606920239157, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.5997371673584, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8571172952651978, "num_tokens": 119122854.0, "step": 3120 }, { "epoch": 0.39702327948098204, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.348072052001953, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8571731448173523, "num_tokens": 119155449.0, "step": 3121 }, { "epoch": 0.39715048975957257, "ewc_loss": 0.051025390625, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.00543212890625e-05, "grad_norm": 25.716684341430664, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.845977783203125, "num_tokens": 119197287.0, "step": 3122 }, { "epoch": 0.3972777000381631, "ewc_loss": 0.05078125, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 3.981590270996094e-05, "grad_norm": 25.1086483001709, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8383519649505615, "num_tokens": 119235749.0, "step": 3123 }, { "epoch": 0.39740491031675357, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.52581024169922, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8462868332862854, "num_tokens": 119267282.0, "step": 3124 }, { "epoch": 0.3975321205953441, "ewc_loss": 0.051025390625, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.00543212890625e-05, "grad_norm": 25.40082359313965, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8422495126724243, "num_tokens": 119304387.0, "step": 3125 }, { "epoch": 0.3976593308739346, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.128633499145508, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8400619029998779, "num_tokens": 119341470.0, "step": 3126 }, { "epoch": 0.3977865411525251, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.394901275634766, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8394545912742615, "num_tokens": 119381067.0, "step": 3127 }, { "epoch": 0.3979137514311156, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.133098602294922, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8512254953384399, "num_tokens": 119415680.0, "step": 3128 }, { "epoch": 0.39804096170970615, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.344533920288086, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.841678261756897, "num_tokens": 119454300.0, "step": 3129 }, { "epoch": 0.3981681719882967, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.3337345123291, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.863448977470398, "num_tokens": 119485599.0, "step": 3130 }, { "epoch": 0.39829538226688715, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.4932918548584, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8368855714797974, "num_tokens": 119525820.0, "step": 3131 }, { "epoch": 0.3984225925454777, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.6389102935791, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8574202060699463, "num_tokens": 119568216.0, "step": 3132 }, { "epoch": 0.3985498028240682, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.55168914794922, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8376332521438599, "num_tokens": 119602914.0, "step": 3133 }, { "epoch": 0.3986770131026587, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.200241088867188, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8475666642189026, "num_tokens": 119637024.0, "step": 3134 }, { "epoch": 0.3988042233812492, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 24.956954956054688, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8419404625892639, "num_tokens": 119679265.0, "step": 3135 }, { "epoch": 0.39893143365983974, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.549293518066406, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8420032262802124, "num_tokens": 119716254.0, "step": 3136 }, { "epoch": 0.3990586439384302, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 24.967792510986328, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8419699668884277, "num_tokens": 119761325.0, "step": 3137 }, { "epoch": 0.39918585421702074, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.41476821899414, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8742374181747437, "num_tokens": 119795468.0, "step": 3138 }, { "epoch": 0.39931306449561127, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 24.994787216186523, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8409050703048706, "num_tokens": 119833804.0, "step": 3139 }, { "epoch": 0.39944027477420174, "ewc_loss": 0.052490234375, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.51544189453125, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8465394377708435, "num_tokens": 119870852.0, "step": 3140 }, { "epoch": 0.39956748505279227, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.048381805419922, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8534027338027954, "num_tokens": 119909258.0, "step": 3141 }, { "epoch": 0.3996946953313828, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.29390525817871, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8390693068504333, "num_tokens": 119940701.0, "step": 3142 }, { "epoch": 0.39982190560997327, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.336421966552734, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8502254486083984, "num_tokens": 119976780.0, "step": 3143 }, { "epoch": 0.3999491158885638, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 24.97216796875, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.841415524482727, "num_tokens": 120017543.0, "step": 3144 }, { "epoch": 0.40007632616715433, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.392663955688477, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8478562235832214, "num_tokens": 120050036.0, "step": 3145 }, { "epoch": 0.4002035364457448, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.400609970092773, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8640365600585938, "num_tokens": 120084582.0, "step": 3146 }, { "epoch": 0.40033074672433533, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.117345809936523, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.845791757106781, "num_tokens": 120120218.0, "step": 3147 }, { "epoch": 0.40045795700292586, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.323965072631836, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8709758520126343, "num_tokens": 120161650.0, "step": 3148 }, { "epoch": 0.40058516728151633, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.256284713745117, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8590789437294006, "num_tokens": 120197409.0, "step": 3149 }, { "epoch": 0.40071237756010686, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.1599178314209, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8386425971984863, "num_tokens": 120240700.0, "step": 3150 }, { "epoch": 0.4008395878386974, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.174514770507812, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.840315043926239, "num_tokens": 120286741.0, "step": 3151 }, { "epoch": 0.40096679811728786, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.351245880126953, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8529278635978699, "num_tokens": 120323236.0, "step": 3152 }, { "epoch": 0.4010940083958784, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.373449325561523, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8484848737716675, "num_tokens": 120362185.0, "step": 3153 }, { "epoch": 0.4012212186744689, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.24639320373535, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.841285228729248, "num_tokens": 120397524.0, "step": 3154 }, { "epoch": 0.4013484289530594, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.50749397277832, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8542767763137817, "num_tokens": 120439723.0, "step": 3155 }, { "epoch": 0.4014756392316499, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.165658950805664, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8545384407043457, "num_tokens": 120483693.0, "step": 3156 }, { "epoch": 0.40160284951024044, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.42183494567871, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8595711588859558, "num_tokens": 120520558.0, "step": 3157 }, { "epoch": 0.4017300597888309, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.164287567138672, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8598523139953613, "num_tokens": 120557901.0, "step": 3158 }, { "epoch": 0.40185727006742145, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.319616317749023, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8560455441474915, "num_tokens": 120593851.0, "step": 3159 }, { "epoch": 0.401984480346012, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.279726028442383, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8445767760276794, "num_tokens": 120624533.0, "step": 3160 }, { "epoch": 0.40211169062460245, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.1747989654541, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8449481725692749, "num_tokens": 120660820.0, "step": 3161 }, { "epoch": 0.402238900903193, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.29315185546875, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8621318340301514, "num_tokens": 120701485.0, "step": 3162 }, { "epoch": 0.4023661111817835, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.456430435180664, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8587396144866943, "num_tokens": 120736677.0, "step": 3163 }, { "epoch": 0.402493321460374, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.349742889404297, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.847097635269165, "num_tokens": 120773933.0, "step": 3164 }, { "epoch": 0.4026205317389645, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.509328842163086, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8547552227973938, "num_tokens": 120804671.0, "step": 3165 }, { "epoch": 0.40274774201755503, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.332029342651367, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8537983298301697, "num_tokens": 120837042.0, "step": 3166 }, { "epoch": 0.4028749522961455, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.41270637512207, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8477940559387207, "num_tokens": 120875127.0, "step": 3167 }, { "epoch": 0.40300216257473603, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.12782096862793, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8495404720306396, "num_tokens": 120915937.0, "step": 3168 }, { "epoch": 0.40312937285332656, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.072864532470703, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8350738883018494, "num_tokens": 120953334.0, "step": 3169 }, { "epoch": 0.40325658313191703, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.252090454101562, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8499363660812378, "num_tokens": 120986674.0, "step": 3170 }, { "epoch": 0.40338379341050756, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.593534469604492, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8565787076950073, "num_tokens": 121026305.0, "step": 3171 }, { "epoch": 0.4035110036890981, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 24.9078369140625, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8640293478965759, "num_tokens": 121061644.0, "step": 3172 }, { "epoch": 0.40363821396768856, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.790653228759766, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8470086455345154, "num_tokens": 121102860.0, "step": 3173 }, { "epoch": 0.4037654242462791, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.0947322845459, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8643243312835693, "num_tokens": 121136566.0, "step": 3174 }, { "epoch": 0.4038926345248696, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.216205596923828, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8606158494949341, "num_tokens": 121173759.0, "step": 3175 }, { "epoch": 0.4040198448034601, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.398006439208984, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.850579023361206, "num_tokens": 121219881.0, "step": 3176 }, { "epoch": 0.4041470550820506, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.357681274414062, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8493404388427734, "num_tokens": 121258719.0, "step": 3177 }, { "epoch": 0.40427426536064115, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.211544036865234, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8282727003097534, "num_tokens": 121298013.0, "step": 3178 }, { "epoch": 0.4044014756392316, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.522275924682617, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8622195720672607, "num_tokens": 121329615.0, "step": 3179 }, { "epoch": 0.40452868591782215, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.340126037597656, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8610169887542725, "num_tokens": 121365199.0, "step": 3180 }, { "epoch": 0.4046558961964127, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.228092193603516, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8466711044311523, "num_tokens": 121402393.0, "step": 3181 }, { "epoch": 0.4047831064750032, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.330596923828125, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8496242761611938, "num_tokens": 121439678.0, "step": 3182 }, { "epoch": 0.4049103167535937, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.36664581298828, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8455888032913208, "num_tokens": 121480781.0, "step": 3183 }, { "epoch": 0.4050375270321842, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.558198928833008, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.858087956905365, "num_tokens": 121517614.0, "step": 3184 }, { "epoch": 0.40516473731077474, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.560203552246094, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8475174903869629, "num_tokens": 121552225.0, "step": 3185 }, { "epoch": 0.4052919475893652, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.23436164855957, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8449024558067322, "num_tokens": 121587177.0, "step": 3186 }, { "epoch": 0.40541915786795574, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.319459915161133, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8338585495948792, "num_tokens": 121619547.0, "step": 3187 }, { "epoch": 0.40554636814654627, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.599384307861328, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8577971458435059, "num_tokens": 121657396.0, "step": 3188 }, { "epoch": 0.40567357842513674, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.205474853515625, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8326025009155273, "num_tokens": 121691867.0, "step": 3189 }, { "epoch": 0.40580078870372727, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.583011627197266, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8441969156265259, "num_tokens": 121723546.0, "step": 3190 }, { "epoch": 0.4059279989823178, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.91220474243164, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8494609594345093, "num_tokens": 121764589.0, "step": 3191 }, { "epoch": 0.40605520926090827, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.36602783203125, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8395304083824158, "num_tokens": 121803281.0, "step": 3192 }, { "epoch": 0.4061824195394988, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 26.071182250976562, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8370109796524048, "num_tokens": 121841710.0, "step": 3193 }, { "epoch": 0.4063096298180893, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.60739517211914, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8487159609794617, "num_tokens": 121881979.0, "step": 3194 }, { "epoch": 0.4064368400966798, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 26.682262420654297, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8547109961509705, "num_tokens": 121921599.0, "step": 3195 }, { "epoch": 0.4065640503752703, "ewc_loss": 0.05126953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.00543212890625e-05, "grad_norm": 25.250919342041016, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8605890274047852, "num_tokens": 121960545.0, "step": 3196 }, { "epoch": 0.40669126065386085, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.448440551757812, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.831993579864502, "num_tokens": 121999259.0, "step": 3197 }, { "epoch": 0.4068184709324513, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.787139892578125, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8646458387374878, "num_tokens": 122039066.0, "step": 3198 }, { "epoch": 0.40694568121104185, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.372222900390625, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8595407009124756, "num_tokens": 122085035.0, "step": 3199 }, { "epoch": 0.4070728914896324, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.57317543029785, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8645712733268738, "num_tokens": 122128007.0, "step": 3200 }, { "epoch": 0.40720010176822286, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.3956356048584, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8481005430221558, "num_tokens": 122169623.0, "step": 3201 }, { "epoch": 0.4073273120468134, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 25.83635711669922, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8445380330085754, "num_tokens": 122201755.0, "step": 3202 }, { "epoch": 0.4074545223254039, "ewc_loss": 0.0517578125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.0531158447265625e-05, "grad_norm": 25.05517578125, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8610734343528748, "num_tokens": 122233680.0, "step": 3203 }, { "epoch": 0.4075817326039944, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.777921676635742, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8487148880958557, "num_tokens": 122265107.0, "step": 3204 }, { "epoch": 0.4077089428825849, "ewc_loss": 0.051513671875, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.029273986816406e-05, "grad_norm": 24.822961807250977, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.855074942111969, "num_tokens": 122302750.0, "step": 3205 }, { "epoch": 0.40783615316117544, "ewc_loss": 0.052490234375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.766027450561523, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8405387997627258, "num_tokens": 122346820.0, "step": 3206 }, { "epoch": 0.4079633634397659, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.158138275146484, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8554831743240356, "num_tokens": 122383254.0, "step": 3207 }, { "epoch": 0.40809057371835644, "ewc_loss": 0.052490234375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.955358505249023, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8626453876495361, "num_tokens": 122416693.0, "step": 3208 }, { "epoch": 0.40821778399694697, "ewc_loss": 0.052001953125, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.076957702636719e-05, "grad_norm": 25.04458236694336, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8551570177078247, "num_tokens": 122452403.0, "step": 3209 }, { "epoch": 0.40834499427553744, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.706865310668945, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8397341966629028, "num_tokens": 122489685.0, "step": 3210 }, { "epoch": 0.40847220455412797, "ewc_loss": 0.05224609375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.100799560546875e-05, "grad_norm": 25.238758087158203, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.832873523235321, "num_tokens": 122529079.0, "step": 3211 }, { "epoch": 0.4085994148327185, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.481931686401367, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8449265956878662, "num_tokens": 122569933.0, "step": 3212 }, { "epoch": 0.40872662511130897, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.331085205078125, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8314424157142639, "num_tokens": 122607884.0, "step": 3213 }, { "epoch": 0.4088538353898995, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.617753982543945, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8512569665908813, "num_tokens": 122651968.0, "step": 3214 }, { "epoch": 0.40898104566849003, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.510923385620117, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8542520999908447, "num_tokens": 122688680.0, "step": 3215 }, { "epoch": 0.4091082559470805, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.561330795288086, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8561056852340698, "num_tokens": 122728271.0, "step": 3216 }, { "epoch": 0.40923546622567103, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.492782592773438, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8489265441894531, "num_tokens": 122759691.0, "step": 3217 }, { "epoch": 0.40936267650426156, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.346256256103516, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.849044680595398, "num_tokens": 122800185.0, "step": 3218 }, { "epoch": 0.40948988678285203, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.502761840820312, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8595544695854187, "num_tokens": 122839548.0, "step": 3219 }, { "epoch": 0.40961709706144256, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.329811096191406, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8359569311141968, "num_tokens": 122879621.0, "step": 3220 }, { "epoch": 0.4097443073400331, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.409305572509766, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8451956510543823, "num_tokens": 122921495.0, "step": 3221 }, { "epoch": 0.40987151761862356, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.297677993774414, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8575690984725952, "num_tokens": 122957197.0, "step": 3222 }, { "epoch": 0.4099987278972141, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.511863708496094, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8317866325378418, "num_tokens": 122999999.0, "step": 3223 }, { "epoch": 0.4101259381758046, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.74193572998047, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8551667928695679, "num_tokens": 123040402.0, "step": 3224 }, { "epoch": 0.4102531484543951, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.293561935424805, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8546322584152222, "num_tokens": 123080086.0, "step": 3225 }, { "epoch": 0.4103803587329856, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.343494415283203, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8524124622344971, "num_tokens": 123120058.0, "step": 3226 }, { "epoch": 0.41050756901157615, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.21426010131836, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8513423800468445, "num_tokens": 123162003.0, "step": 3227 }, { "epoch": 0.4106347792901666, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.286911010742188, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8184773921966553, "num_tokens": 123202249.0, "step": 3228 }, { "epoch": 0.41076198956875715, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.27537727355957, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8452355861663818, "num_tokens": 123237401.0, "step": 3229 }, { "epoch": 0.4108891998473477, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 26.64803123474121, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8663583993911743, "num_tokens": 123269364.0, "step": 3230 }, { "epoch": 0.4110164101259382, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.124282836914062, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8535290956497192, "num_tokens": 123307743.0, "step": 3231 }, { "epoch": 0.4111436204045287, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.188142776489258, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8556074500083923, "num_tokens": 123340102.0, "step": 3232 }, { "epoch": 0.4112708306831192, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.804025650024414, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8461601734161377, "num_tokens": 123374956.0, "step": 3233 }, { "epoch": 0.41139804096170973, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.306148529052734, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8604771494865417, "num_tokens": 123418070.0, "step": 3234 }, { "epoch": 0.4115252512403002, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.408998489379883, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8496192693710327, "num_tokens": 123453183.0, "step": 3235 }, { "epoch": 0.41165246151889073, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 26.007204055786133, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8508446216583252, "num_tokens": 123491526.0, "step": 3236 }, { "epoch": 0.41177967179748126, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.475736618041992, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8524926900863647, "num_tokens": 123529743.0, "step": 3237 }, { "epoch": 0.41190688207607173, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.178895950317383, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8543541431427002, "num_tokens": 123573402.0, "step": 3238 }, { "epoch": 0.41203409235466226, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.72529411315918, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.847363293170929, "num_tokens": 123604167.0, "step": 3239 }, { "epoch": 0.4121613026332528, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.500991821289062, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8473756313323975, "num_tokens": 123648104.0, "step": 3240 }, { "epoch": 0.41228851291184326, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.299875259399414, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8406999111175537, "num_tokens": 123689678.0, "step": 3241 }, { "epoch": 0.4124157231904338, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.26659393310547, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8588272929191589, "num_tokens": 123729257.0, "step": 3242 }, { "epoch": 0.4125429334690243, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.58939552307129, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8532479405403137, "num_tokens": 123772251.0, "step": 3243 }, { "epoch": 0.4126701437476148, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.61322593688965, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8642137050628662, "num_tokens": 123810345.0, "step": 3244 }, { "epoch": 0.4127973540262053, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.404993057250977, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8474904894828796, "num_tokens": 123855276.0, "step": 3245 }, { "epoch": 0.41292456430479585, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.47340965270996, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8536006808280945, "num_tokens": 123896613.0, "step": 3246 }, { "epoch": 0.4130517745833863, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.712369918823242, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.834686279296875, "num_tokens": 123940713.0, "step": 3247 }, { "epoch": 0.41317898486197685, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.59436798095703, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8399617075920105, "num_tokens": 123981485.0, "step": 3248 }, { "epoch": 0.4133061951405674, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.3001708984375, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8508442640304565, "num_tokens": 124018159.0, "step": 3249 }, { "epoch": 0.41343340541915785, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.52444839477539, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8606163263320923, "num_tokens": 124049911.0, "step": 3250 }, { "epoch": 0.4135606156977484, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.6242733001709, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8717387318611145, "num_tokens": 124084260.0, "step": 3251 }, { "epoch": 0.4136878259763389, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.728439331054688, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8543553352355957, "num_tokens": 124122741.0, "step": 3252 }, { "epoch": 0.4138150362549294, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.33995819091797, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8475020527839661, "num_tokens": 124162793.0, "step": 3253 }, { "epoch": 0.4139422465335199, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.630775451660156, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8430548906326294, "num_tokens": 124206889.0, "step": 3254 }, { "epoch": 0.41406945681211044, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.644577026367188, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8618409037590027, "num_tokens": 124244551.0, "step": 3255 }, { "epoch": 0.4141966670907009, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.415189743041992, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8541929721832275, "num_tokens": 124290485.0, "step": 3256 }, { "epoch": 0.41432387736929144, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.510753631591797, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8579621315002441, "num_tokens": 124323763.0, "step": 3257 }, { "epoch": 0.41445108764788197, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.495084762573242, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8523204922676086, "num_tokens": 124363318.0, "step": 3258 }, { "epoch": 0.41457829792647244, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.324674606323242, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8745802044868469, "num_tokens": 124400845.0, "step": 3259 }, { "epoch": 0.41470550820506297, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.950328826904297, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8580803871154785, "num_tokens": 124442546.0, "step": 3260 }, { "epoch": 0.4148327184836535, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.234289169311523, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8469882011413574, "num_tokens": 124477737.0, "step": 3261 }, { "epoch": 0.41495992876224397, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.896177291870117, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8379296660423279, "num_tokens": 124518081.0, "step": 3262 }, { "epoch": 0.4150871390408345, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.305313110351562, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8415679931640625, "num_tokens": 124556361.0, "step": 3263 }, { "epoch": 0.415214349319425, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 26.031896591186523, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8504388928413391, "num_tokens": 124588490.0, "step": 3264 }, { "epoch": 0.4153415595980155, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.576475143432617, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8490987420082092, "num_tokens": 124624656.0, "step": 3265 }, { "epoch": 0.415468769876606, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.744873046875, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8369631767272949, "num_tokens": 124660681.0, "step": 3266 }, { "epoch": 0.41559598015519655, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.853660583496094, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8523281812667847, "num_tokens": 124703448.0, "step": 3267 }, { "epoch": 0.415723190433787, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.698902130126953, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8489992022514343, "num_tokens": 124742078.0, "step": 3268 }, { "epoch": 0.41585040071237755, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.80172348022461, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8381537795066833, "num_tokens": 124779894.0, "step": 3269 }, { "epoch": 0.4159776109909681, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.6796817779541, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8467224836349487, "num_tokens": 124817918.0, "step": 3270 }, { "epoch": 0.41610482126955856, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.86834144592285, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.867552638053894, "num_tokens": 124854839.0, "step": 3271 }, { "epoch": 0.4162320315481491, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.854354858398438, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8357211351394653, "num_tokens": 124886779.0, "step": 3272 }, { "epoch": 0.4163592418267396, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.655973434448242, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8598132729530334, "num_tokens": 124927919.0, "step": 3273 }, { "epoch": 0.4164864521053301, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.874483108520508, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.848438560962677, "num_tokens": 124966475.0, "step": 3274 }, { "epoch": 0.4166136623839206, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.626985549926758, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8263341188430786, "num_tokens": 125004110.0, "step": 3275 }, { "epoch": 0.41674087266251114, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.670862197875977, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8652527928352356, "num_tokens": 125043164.0, "step": 3276 }, { "epoch": 0.4168680829411016, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.62677574157715, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8542702198028564, "num_tokens": 125084255.0, "step": 3277 }, { "epoch": 0.41699529321969214, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.544395446777344, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.865058422088623, "num_tokens": 125124971.0, "step": 3278 }, { "epoch": 0.41712250349828267, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.729745864868164, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8473066091537476, "num_tokens": 125161470.0, "step": 3279 }, { "epoch": 0.4172497137768732, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.760047912597656, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8348410725593567, "num_tokens": 125199117.0, "step": 3280 }, { "epoch": 0.41737692405546367, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.820606231689453, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8323333859443665, "num_tokens": 125238136.0, "step": 3281 }, { "epoch": 0.4175041343340542, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.435251235961914, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8486452102661133, "num_tokens": 125281041.0, "step": 3282 }, { "epoch": 0.41763134461264473, "ewc_loss": 0.053955078125, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.797422409057617, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8530662059783936, "num_tokens": 125314739.0, "step": 3283 }, { "epoch": 0.4177585548912352, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.899253845214844, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8434563875198364, "num_tokens": 125353129.0, "step": 3284 }, { "epoch": 0.41788576516982573, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.683191299438477, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8500491380691528, "num_tokens": 125394092.0, "step": 3285 }, { "epoch": 0.41801297544841626, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.86519432067871, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8443713188171387, "num_tokens": 125438262.0, "step": 3286 }, { "epoch": 0.41814018572700673, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.51953887939453, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8487157225608826, "num_tokens": 125477842.0, "step": 3287 }, { "epoch": 0.41826739600559726, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.834264755249023, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8621808290481567, "num_tokens": 125517236.0, "step": 3288 }, { "epoch": 0.4183946062841878, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.618913650512695, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8436130881309509, "num_tokens": 125561598.0, "step": 3289 }, { "epoch": 0.41852181656277826, "ewc_loss": 0.053955078125, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.850561141967773, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8482259511947632, "num_tokens": 125599104.0, "step": 3290 }, { "epoch": 0.4186490268413688, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.487192153930664, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8505247235298157, "num_tokens": 125635286.0, "step": 3291 }, { "epoch": 0.4187762371199593, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.843751907348633, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8547346591949463, "num_tokens": 125669534.0, "step": 3292 }, { "epoch": 0.4189034473985498, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.616533279418945, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8495601415634155, "num_tokens": 125707966.0, "step": 3293 }, { "epoch": 0.4190306576771403, "ewc_loss": 0.053955078125, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 26.113475799560547, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8441777229309082, "num_tokens": 125741448.0, "step": 3294 }, { "epoch": 0.41915786795573085, "ewc_loss": 0.053955078125, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.851356506347656, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8445738554000854, "num_tokens": 125776887.0, "step": 3295 }, { "epoch": 0.4192850782343213, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.934064865112305, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8551781177520752, "num_tokens": 125807899.0, "step": 3296 }, { "epoch": 0.41941228851291185, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 26.201492309570312, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8587079644203186, "num_tokens": 125847711.0, "step": 3297 }, { "epoch": 0.4195394987915024, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.675031661987305, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8638505935668945, "num_tokens": 125883248.0, "step": 3298 }, { "epoch": 0.41966670907009285, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.9599609375, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8385388851165771, "num_tokens": 125925771.0, "step": 3299 }, { "epoch": 0.4197939193486834, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.692363739013672, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8367146849632263, "num_tokens": 125968815.0, "step": 3300 }, { "epoch": 0.4199211296272739, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.90269660949707, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8440878987312317, "num_tokens": 126007761.0, "step": 3301 }, { "epoch": 0.4200483399058644, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 26.19413185119629, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8326457738876343, "num_tokens": 126047872.0, "step": 3302 }, { "epoch": 0.4201755501844549, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.672914505004883, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8565911054611206, "num_tokens": 126081273.0, "step": 3303 }, { "epoch": 0.42030276046304543, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.049989700317383, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8363940715789795, "num_tokens": 126121590.0, "step": 3304 }, { "epoch": 0.4204299707416359, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.611282348632812, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8327920436859131, "num_tokens": 126168396.0, "step": 3305 }, { "epoch": 0.42055718102022643, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.953264236450195, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8451071977615356, "num_tokens": 126205220.0, "step": 3306 }, { "epoch": 0.42068439129881696, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.19091796875, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8432244658470154, "num_tokens": 126239131.0, "step": 3307 }, { "epoch": 0.42081160157740743, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.89020538330078, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.851563572883606, "num_tokens": 126271931.0, "step": 3308 }, { "epoch": 0.42093881185599796, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.71021842956543, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8588576912879944, "num_tokens": 126307730.0, "step": 3309 }, { "epoch": 0.4210660221345885, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 26.39292335510254, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8538385033607483, "num_tokens": 126344528.0, "step": 3310 }, { "epoch": 0.42119323241317896, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.828529357910156, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8581511974334717, "num_tokens": 126377260.0, "step": 3311 }, { "epoch": 0.4213204426917695, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.956933975219727, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8467826247215271, "num_tokens": 126409723.0, "step": 3312 }, { "epoch": 0.42144765297036, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 26.09065818786621, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8536235094070435, "num_tokens": 126450929.0, "step": 3313 }, { "epoch": 0.4215748632489505, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.80060386657715, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8600969314575195, "num_tokens": 126486878.0, "step": 3314 }, { "epoch": 0.421702073527541, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.851585388183594, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8392413854598999, "num_tokens": 126522884.0, "step": 3315 }, { "epoch": 0.42182928380613155, "ewc_loss": 0.052978515625, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 26.05668830871582, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8513892292976379, "num_tokens": 126560848.0, "step": 3316 }, { "epoch": 0.421956494084722, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.802837371826172, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8520771265029907, "num_tokens": 126592947.0, "step": 3317 }, { "epoch": 0.42208370436331255, "ewc_loss": 0.053466796875, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.10000991821289, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8613148927688599, "num_tokens": 126630631.0, "step": 3318 }, { "epoch": 0.4222109146419031, "ewc_loss": 0.052734375, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 4.124641418457031e-05, "grad_norm": 25.407495498657227, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8457983732223511, "num_tokens": 126673395.0, "step": 3319 }, { "epoch": 0.42233812492049355, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 26.03999137878418, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8669981956481934, "num_tokens": 126711504.0, "step": 3320 }, { "epoch": 0.4224653351990841, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.172325134277344e-05, "grad_norm": 25.51577377319336, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.863364577293396, "num_tokens": 126753217.0, "step": 3321 }, { "epoch": 0.4225925454776746, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 26.075223922729492, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8565791249275208, "num_tokens": 126792982.0, "step": 3322 }, { "epoch": 0.4227197557562651, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.606584548950195, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8432605266571045, "num_tokens": 126823771.0, "step": 3323 }, { "epoch": 0.4228469660348556, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 26.087779998779297, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8540052175521851, "num_tokens": 126867137.0, "step": 3324 }, { "epoch": 0.42297417631344614, "ewc_loss": 0.05322265625, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.1484832763671875e-05, "grad_norm": 25.63593101501465, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8563680648803711, "num_tokens": 126902680.0, "step": 3325 }, { "epoch": 0.4231013865920366, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.386255264282227, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8630526661872864, "num_tokens": 126941037.0, "step": 3326 }, { "epoch": 0.42322859687062714, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.780960083007812, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8530445098876953, "num_tokens": 126980009.0, "step": 3327 }, { "epoch": 0.42335580714921767, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 26.039012908935547, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8637526035308838, "num_tokens": 127020022.0, "step": 3328 }, { "epoch": 0.42348301742780814, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.71484375, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8326310515403748, "num_tokens": 127060500.0, "step": 3329 }, { "epoch": 0.42361022770639867, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.828523635864258, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8482596278190613, "num_tokens": 127095886.0, "step": 3330 }, { "epoch": 0.4237374379849892, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.928159713745117, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8426434993743896, "num_tokens": 127134334.0, "step": 3331 }, { "epoch": 0.4238646482635797, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.862043380737305, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8431189060211182, "num_tokens": 127174288.0, "step": 3332 }, { "epoch": 0.4239918585421702, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.559478759765625, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8487813472747803, "num_tokens": 127210311.0, "step": 3333 }, { "epoch": 0.4241190688207607, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.904193878173828, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8714856505393982, "num_tokens": 127252813.0, "step": 3334 }, { "epoch": 0.42424627909935125, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.849533081054688, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8388026356697083, "num_tokens": 127285847.0, "step": 3335 }, { "epoch": 0.4243734893779417, "ewc_loss": 0.0537109375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.1961669921875e-05, "grad_norm": 25.772262573242188, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8655686974525452, "num_tokens": 127325360.0, "step": 3336 }, { "epoch": 0.42450069965653225, "ewc_loss": 0.053955078125, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.220008850097656e-05, "grad_norm": 25.982046127319336, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8551888465881348, "num_tokens": 127366446.0, "step": 3337 }, { "epoch": 0.4246279099351228, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.5743465423584, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8601112365722656, "num_tokens": 127402608.0, "step": 3338 }, { "epoch": 0.42475512021371326, "ewc_loss": 0.054443359375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 25.892948150634766, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8566712141036987, "num_tokens": 127445355.0, "step": 3339 }, { "epoch": 0.4248823304923038, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.76844024658203, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.846196174621582, "num_tokens": 127480523.0, "step": 3340 }, { "epoch": 0.4250095407708943, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 26.002002716064453, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8708951473236084, "num_tokens": 127520787.0, "step": 3341 }, { "epoch": 0.4251367510494848, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.738428115844727, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8463245034217834, "num_tokens": 127565576.0, "step": 3342 }, { "epoch": 0.4252639613280753, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.758562088012695, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8621962070465088, "num_tokens": 127599144.0, "step": 3343 }, { "epoch": 0.42539117160666584, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.638803482055664, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8707758188247681, "num_tokens": 127637361.0, "step": 3344 }, { "epoch": 0.4255183818852563, "ewc_loss": 0.054443359375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 26.001449584960938, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8416712880134583, "num_tokens": 127675139.0, "step": 3345 }, { "epoch": 0.42564559216384684, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.465322494506836, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8576362133026123, "num_tokens": 127718822.0, "step": 3346 }, { "epoch": 0.42577280244243737, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.227272033691406, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8557332754135132, "num_tokens": 127763332.0, "step": 3347 }, { "epoch": 0.42590001272102784, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.75113296508789, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8633293509483337, "num_tokens": 127800346.0, "step": 3348 }, { "epoch": 0.42602722299961837, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.091440200805664, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8487469553947449, "num_tokens": 127845142.0, "step": 3349 }, { "epoch": 0.4261544332782089, "ewc_loss": 0.05419921875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 25.820316314697266, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8337397575378418, "num_tokens": 127881131.0, "step": 3350 }, { "epoch": 0.4262816435567994, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.015897750854492, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8653103113174438, "num_tokens": 127917941.0, "step": 3351 }, { "epoch": 0.4264088538353899, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.858057022094727, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8383383750915527, "num_tokens": 127963345.0, "step": 3352 }, { "epoch": 0.42653606411398043, "ewc_loss": 0.054443359375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 25.6922664642334, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8608030080795288, "num_tokens": 128004839.0, "step": 3353 }, { "epoch": 0.4266632743925709, "ewc_loss": 0.054443359375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 25.74944305419922, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8444911241531372, "num_tokens": 128042709.0, "step": 3354 }, { "epoch": 0.42679048467116143, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.781518936157227, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.836119532585144, "num_tokens": 128085069.0, "step": 3355 }, { "epoch": 0.42691769494975196, "ewc_loss": 0.054443359375, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 25.947404861450195, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8463733792304993, "num_tokens": 128124300.0, "step": 3356 }, { "epoch": 0.42704490522834243, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.650724411010742, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8493884801864624, "num_tokens": 128164410.0, "step": 3357 }, { "epoch": 0.42717211550693296, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.80500602722168, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8618293404579163, "num_tokens": 128206524.0, "step": 3358 }, { "epoch": 0.4272993257855235, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.038740158081055, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8426398038864136, "num_tokens": 128246387.0, "step": 3359 }, { "epoch": 0.42742653606411396, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.6547908782959, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8622979521751404, "num_tokens": 128287794.0, "step": 3360 }, { "epoch": 0.4275537463427045, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.84064292907715, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8482720851898193, "num_tokens": 128328467.0, "step": 3361 }, { "epoch": 0.427680956621295, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.67066192626953, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8597975969314575, "num_tokens": 128364319.0, "step": 3362 }, { "epoch": 0.4278081668998855, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.914731979370117, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8475735783576965, "num_tokens": 128403892.0, "step": 3363 }, { "epoch": 0.427935377178476, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.9440975189209, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8325321078300476, "num_tokens": 128446760.0, "step": 3364 }, { "epoch": 0.42806258745706655, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.778682708740234, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8268910646438599, "num_tokens": 128485498.0, "step": 3365 }, { "epoch": 0.428189797735657, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.99234962463379, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8405531644821167, "num_tokens": 128527871.0, "step": 3366 }, { "epoch": 0.42831700801424755, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.687580108642578, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8380893468856812, "num_tokens": 128567792.0, "step": 3367 }, { "epoch": 0.4284442182928381, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.802949905395508, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8425500392913818, "num_tokens": 128607217.0, "step": 3368 }, { "epoch": 0.42857142857142855, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.673425674438477, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8574284315109253, "num_tokens": 128642787.0, "step": 3369 }, { "epoch": 0.4286986388500191, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.67547035217285, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8619385361671448, "num_tokens": 128680462.0, "step": 3370 }, { "epoch": 0.4288258491286096, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.95415687561035, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8475086688995361, "num_tokens": 128715101.0, "step": 3371 }, { "epoch": 0.4289530594072001, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.73574447631836, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8427342772483826, "num_tokens": 128749601.0, "step": 3372 }, { "epoch": 0.4290802696857906, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.822824478149414, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.846738338470459, "num_tokens": 128788744.0, "step": 3373 }, { "epoch": 0.42920747996438113, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.54727554321289, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8472042679786682, "num_tokens": 128828102.0, "step": 3374 }, { "epoch": 0.4293346902429716, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.776203155517578, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8453737497329712, "num_tokens": 128863610.0, "step": 3375 }, { "epoch": 0.42946190052156213, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.0573673248291, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8652092814445496, "num_tokens": 128898836.0, "step": 3376 }, { "epoch": 0.42958911080015266, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.846162796020508, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.852698802947998, "num_tokens": 128934821.0, "step": 3377 }, { "epoch": 0.42971632107874314, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.1334228515625, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8495831489562988, "num_tokens": 128966914.0, "step": 3378 }, { "epoch": 0.42984353135733366, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.28877067565918, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8579098582267761, "num_tokens": 129008305.0, "step": 3379 }, { "epoch": 0.4299707416359242, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.237640380859375, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8709405660629272, "num_tokens": 129043331.0, "step": 3380 }, { "epoch": 0.4300979519145147, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.293672561645508, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8573711514472961, "num_tokens": 129083513.0, "step": 3381 }, { "epoch": 0.4302251621931052, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.38747787475586, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8502509593963623, "num_tokens": 129113463.0, "step": 3382 }, { "epoch": 0.4303523724716957, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.477397918701172, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8270226716995239, "num_tokens": 129153080.0, "step": 3383 }, { "epoch": 0.43047958275028625, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.227558135986328, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8479289412498474, "num_tokens": 129193338.0, "step": 3384 }, { "epoch": 0.4306067930288767, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.744482040405273, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8412384390830994, "num_tokens": 129234816.0, "step": 3385 }, { "epoch": 0.43073400330746725, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.879314422607422, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8530808687210083, "num_tokens": 129275228.0, "step": 3386 }, { "epoch": 0.4308612135860578, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.05475425720215, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8324536681175232, "num_tokens": 129309747.0, "step": 3387 }, { "epoch": 0.43098842386464825, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.02150535583496, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8540393710136414, "num_tokens": 129342580.0, "step": 3388 }, { "epoch": 0.4311156341432388, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.92581558227539, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.845816969871521, "num_tokens": 129376599.0, "step": 3389 }, { "epoch": 0.4312428444218293, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.819833755493164, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8492832779884338, "num_tokens": 129414783.0, "step": 3390 }, { "epoch": 0.4313700547004198, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.759143829345703, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8501700758934021, "num_tokens": 129450287.0, "step": 3391 }, { "epoch": 0.4314972649790103, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.029373168945312, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8336949348449707, "num_tokens": 129486733.0, "step": 3392 }, { "epoch": 0.43162447525760084, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.796051025390625, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8505961894989014, "num_tokens": 129524392.0, "step": 3393 }, { "epoch": 0.4317516855361913, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.00386619567871, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8362346887588501, "num_tokens": 129558452.0, "step": 3394 }, { "epoch": 0.43187889581478184, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.001392364501953, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8489723205566406, "num_tokens": 129597076.0, "step": 3395 }, { "epoch": 0.43200610609337237, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.408344268798828, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.843800961971283, "num_tokens": 129635861.0, "step": 3396 }, { "epoch": 0.43213331637196284, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.67031478881836, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8456481099128723, "num_tokens": 129673760.0, "step": 3397 }, { "epoch": 0.43226052665055337, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.47226905822754, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8522952795028687, "num_tokens": 129710730.0, "step": 3398 }, { "epoch": 0.4323877369291439, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.990264892578125, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8470885157585144, "num_tokens": 129748724.0, "step": 3399 }, { "epoch": 0.43251494720773437, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.10633087158203, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8633053302764893, "num_tokens": 129787478.0, "step": 3400 }, { "epoch": 0.4326421574863249, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.018800735473633, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8610658049583435, "num_tokens": 129829852.0, "step": 3401 }, { "epoch": 0.4327693677649154, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.162307739257812, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8495166897773743, "num_tokens": 129869279.0, "step": 3402 }, { "epoch": 0.4328965780435059, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.99005126953125, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8460025787353516, "num_tokens": 129903347.0, "step": 3403 }, { "epoch": 0.4330237883220964, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.03112030029297, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8608351349830627, "num_tokens": 129940028.0, "step": 3404 }, { "epoch": 0.43315099860068695, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 26.09272003173828, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8602930903434753, "num_tokens": 129976438.0, "step": 3405 }, { "epoch": 0.4332782088792774, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.679073333740234, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8553684949874878, "num_tokens": 130015698.0, "step": 3406 }, { "epoch": 0.43340541915786795, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.026872634887695, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8650853633880615, "num_tokens": 130056000.0, "step": 3407 }, { "epoch": 0.4335326294364585, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.88953971862793, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8472893834114075, "num_tokens": 130096990.0, "step": 3408 }, { "epoch": 0.43365983971504896, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.129146575927734, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8606998920440674, "num_tokens": 130128556.0, "step": 3409 }, { "epoch": 0.4337870499936395, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.866697311401367, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8580896854400635, "num_tokens": 130170607.0, "step": 3410 }, { "epoch": 0.43391426027223, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.261625289916992, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8468988537788391, "num_tokens": 130212511.0, "step": 3411 }, { "epoch": 0.4340414705508205, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.101430892944336, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.843993067741394, "num_tokens": 130249246.0, "step": 3412 }, { "epoch": 0.434168680829411, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.165874481201172, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8716385364532471, "num_tokens": 130291250.0, "step": 3413 }, { "epoch": 0.43429589110800154, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.09412384033203, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8312458395957947, "num_tokens": 130327094.0, "step": 3414 }, { "epoch": 0.434423101386592, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.29216957092285, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8426505327224731, "num_tokens": 130366482.0, "step": 3415 }, { "epoch": 0.43455031166518254, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.096050262451172, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8514885306358337, "num_tokens": 130407416.0, "step": 3416 }, { "epoch": 0.43467752194377307, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.90995216369629, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8567860722541809, "num_tokens": 130447125.0, "step": 3417 }, { "epoch": 0.43480473222236354, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.275646209716797, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.855864405632019, "num_tokens": 130480955.0, "step": 3418 }, { "epoch": 0.43493194250095407, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.787809371948242, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8630366325378418, "num_tokens": 130511771.0, "step": 3419 }, { "epoch": 0.4350591527795446, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.39585304260254, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8498369455337524, "num_tokens": 130550080.0, "step": 3420 }, { "epoch": 0.4351863630581351, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.63370132446289, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8384526968002319, "num_tokens": 130588109.0, "step": 3421 }, { "epoch": 0.4353135733367256, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.364242553710938, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8366602063179016, "num_tokens": 130626832.0, "step": 3422 }, { "epoch": 0.43544078361531613, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.871051788330078, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8554129004478455, "num_tokens": 130662202.0, "step": 3423 }, { "epoch": 0.4355679938939066, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.438451766967773, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.846493124961853, "num_tokens": 130700747.0, "step": 3424 }, { "epoch": 0.43569520417249713, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 25.79450225830078, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8488202095031738, "num_tokens": 130742849.0, "step": 3425 }, { "epoch": 0.43582241445108766, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.316890716552734, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8645936250686646, "num_tokens": 130780156.0, "step": 3426 }, { "epoch": 0.43594962472967813, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 25.879005432128906, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8506592512130737, "num_tokens": 130825824.0, "step": 3427 }, { "epoch": 0.43607683500826866, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.32245445251465, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8599005341529846, "num_tokens": 130863603.0, "step": 3428 }, { "epoch": 0.4362040452868592, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.225149154663086, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8530820608139038, "num_tokens": 130906689.0, "step": 3429 }, { "epoch": 0.4363312555654497, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.127342224121094, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8477414846420288, "num_tokens": 130947785.0, "step": 3430 }, { "epoch": 0.4364584658440402, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.168298721313477, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8622515201568604, "num_tokens": 130985489.0, "step": 3431 }, { "epoch": 0.4365856761226307, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.390544891357422, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.860274076461792, "num_tokens": 131025274.0, "step": 3432 }, { "epoch": 0.43671288640122125, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.379899978637695, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8507203459739685, "num_tokens": 131060132.0, "step": 3433 }, { "epoch": 0.4368400966798117, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.305879592895508, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8406413793563843, "num_tokens": 131105109.0, "step": 3434 }, { "epoch": 0.43696730695840225, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.12299156188965, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.865862250328064, "num_tokens": 131142094.0, "step": 3435 }, { "epoch": 0.4370945172369928, "ewc_loss": 0.054443359375, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.2438507080078125e-05, "grad_norm": 26.424562454223633, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8505039811134338, "num_tokens": 131174648.0, "step": 3436 }, { "epoch": 0.43722172751558325, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.324031829833984, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8668524622917175, "num_tokens": 131213447.0, "step": 3437 }, { "epoch": 0.4373489377941738, "ewc_loss": 0.0546875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.267692565917969e-05, "grad_norm": 26.122413635253906, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8487301468849182, "num_tokens": 131249622.0, "step": 3438 }, { "epoch": 0.4374761480727643, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.134775161743164, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8682291507720947, "num_tokens": 131285883.0, "step": 3439 }, { "epoch": 0.4376033583513548, "ewc_loss": 0.054931640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.291534423828125e-05, "grad_norm": 26.01163673400879, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8557467460632324, "num_tokens": 131322774.0, "step": 3440 }, { "epoch": 0.4377305686299453, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.034229278564453, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8628219366073608, "num_tokens": 131364513.0, "step": 3441 }, { "epoch": 0.43785777890853583, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.991165161132812, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8431242108345032, "num_tokens": 131402299.0, "step": 3442 }, { "epoch": 0.4379849891871263, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.1935977935791, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8587951064109802, "num_tokens": 131447830.0, "step": 3443 }, { "epoch": 0.43811219946571683, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.986282348632812, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8450464010238647, "num_tokens": 131481058.0, "step": 3444 }, { "epoch": 0.43823940974430736, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.1435489654541, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8479178547859192, "num_tokens": 131518376.0, "step": 3445 }, { "epoch": 0.43836662002289783, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.116825103759766, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8371316194534302, "num_tokens": 131553702.0, "step": 3446 }, { "epoch": 0.43849383030148836, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.028579711914062, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8599981665611267, "num_tokens": 131587026.0, "step": 3447 }, { "epoch": 0.4386210405800789, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.31625747680664, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8579621315002441, "num_tokens": 131621145.0, "step": 3448 }, { "epoch": 0.43874825085866936, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.115678787231445, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8579963445663452, "num_tokens": 131661618.0, "step": 3449 }, { "epoch": 0.4388754611372599, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 26.001203536987305, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8448160290718079, "num_tokens": 131693840.0, "step": 3450 }, { "epoch": 0.4390026714158504, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.286046981811523, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8543060421943665, "num_tokens": 131730856.0, "step": 3451 }, { "epoch": 0.4391298816944409, "ewc_loss": 0.055419921875, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.3392181396484375e-05, "grad_norm": 26.008195877075195, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8280420303344727, "num_tokens": 131769172.0, "step": 3452 }, { "epoch": 0.4392570919730314, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.093093872070312, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8543899059295654, "num_tokens": 131810481.0, "step": 3453 }, { "epoch": 0.43938430225162195, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.45643424987793, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8525444269180298, "num_tokens": 131850742.0, "step": 3454 }, { "epoch": 0.4395115125302124, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.509605407714844, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8514137268066406, "num_tokens": 131886675.0, "step": 3455 }, { "epoch": 0.43963872280880295, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.305931091308594, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8572688698768616, "num_tokens": 131926831.0, "step": 3456 }, { "epoch": 0.4397659330873935, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.073923110961914, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.851087749004364, "num_tokens": 131962265.0, "step": 3457 }, { "epoch": 0.43989314336598395, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.121540069580078, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8459246158599854, "num_tokens": 131999906.0, "step": 3458 }, { "epoch": 0.4400203536445745, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.06182098388672, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8675094246864319, "num_tokens": 132031330.0, "step": 3459 }, { "epoch": 0.440147563923165, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.87339210510254, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8501895666122437, "num_tokens": 132065049.0, "step": 3460 }, { "epoch": 0.4402747742017555, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.1121826171875, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8557950854301453, "num_tokens": 132100993.0, "step": 3461 }, { "epoch": 0.440401984480346, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.831260681152344, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8500549793243408, "num_tokens": 132138169.0, "step": 3462 }, { "epoch": 0.44052919475893654, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.863725662231445, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8596484065055847, "num_tokens": 132170498.0, "step": 3463 }, { "epoch": 0.440656405037527, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.87520980834961, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8334693908691406, "num_tokens": 132213583.0, "step": 3464 }, { "epoch": 0.44078361531611754, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.835981369018555, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8471469879150391, "num_tokens": 132255427.0, "step": 3465 }, { "epoch": 0.44091082559470807, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.159282684326172, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8508924245834351, "num_tokens": 132295795.0, "step": 3466 }, { "epoch": 0.44103803587329854, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.698850631713867, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8339564800262451, "num_tokens": 132336881.0, "step": 3467 }, { "epoch": 0.44116524615188907, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.767576217651367, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8520993590354919, "num_tokens": 132375136.0, "step": 3468 }, { "epoch": 0.4412924564304796, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.181699752807617, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8496781587600708, "num_tokens": 132412893.0, "step": 3469 }, { "epoch": 0.44141966670907007, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.207189559936523, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8443505764007568, "num_tokens": 132444360.0, "step": 3470 }, { "epoch": 0.4415468769876606, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.71347427368164, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.825397253036499, "num_tokens": 132485022.0, "step": 3471 }, { "epoch": 0.4416740872662511, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.092815399169922, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.837130069732666, "num_tokens": 132522800.0, "step": 3472 }, { "epoch": 0.4418012975448416, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.763172149658203, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8533369302749634, "num_tokens": 132560527.0, "step": 3473 }, { "epoch": 0.4419285078234321, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.338336944580078, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8486741185188293, "num_tokens": 132601075.0, "step": 3474 }, { "epoch": 0.44205571810202265, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.604652404785156, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8416019678115845, "num_tokens": 132639851.0, "step": 3475 }, { "epoch": 0.4421829283806131, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.035186767578125, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8533689379692078, "num_tokens": 132679849.0, "step": 3476 }, { "epoch": 0.44231013865920366, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.895740509033203, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.856203556060791, "num_tokens": 132721020.0, "step": 3477 }, { "epoch": 0.4424373489377942, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.913532257080078, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8385623693466187, "num_tokens": 132760933.0, "step": 3478 }, { "epoch": 0.44256455921638466, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.844013214111328, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8619896769523621, "num_tokens": 132800627.0, "step": 3479 }, { "epoch": 0.4426917694949752, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.92930793762207, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8488830924034119, "num_tokens": 132833232.0, "step": 3480 }, { "epoch": 0.4428189797735657, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.876062393188477, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.848967969417572, "num_tokens": 132874670.0, "step": 3481 }, { "epoch": 0.44294619005215624, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.054121017456055, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8529948592185974, "num_tokens": 132916187.0, "step": 3482 }, { "epoch": 0.4430734003307467, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.85698127746582, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8545961380004883, "num_tokens": 132956948.0, "step": 3483 }, { "epoch": 0.44320061060933724, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.982622146606445, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8571793437004089, "num_tokens": 132995213.0, "step": 3484 }, { "epoch": 0.44332782088792777, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.898914337158203, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.848717451095581, "num_tokens": 133038850.0, "step": 3485 }, { "epoch": 0.44345503116651824, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.853063583374023, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8534911274909973, "num_tokens": 133080766.0, "step": 3486 }, { "epoch": 0.44358224144510877, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.145793914794922, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8482104539871216, "num_tokens": 133120845.0, "step": 3487 }, { "epoch": 0.4437094517236993, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.73959732055664, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8490444421768188, "num_tokens": 133166011.0, "step": 3488 }, { "epoch": 0.4438366620022898, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.221240997314453, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8495739698410034, "num_tokens": 133203507.0, "step": 3489 }, { "epoch": 0.4439638722808803, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.70105743408203, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8283776044845581, "num_tokens": 133240554.0, "step": 3490 }, { "epoch": 0.44409108255947083, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.28208351135254, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8343257904052734, "num_tokens": 133274987.0, "step": 3491 }, { "epoch": 0.4442182928380613, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.076499938964844, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8242028951644897, "num_tokens": 133319287.0, "step": 3492 }, { "epoch": 0.44434550311665183, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.035730361938477, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8556482791900635, "num_tokens": 133353670.0, "step": 3493 }, { "epoch": 0.44447271339524236, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.024036407470703, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8533671498298645, "num_tokens": 133391950.0, "step": 3494 }, { "epoch": 0.44459992367383283, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.010147094726562, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8556643724441528, "num_tokens": 133424758.0, "step": 3495 }, { "epoch": 0.44472713395242336, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.36453628540039, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8793643712997437, "num_tokens": 133459869.0, "step": 3496 }, { "epoch": 0.4448543442310139, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.984336853027344, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8502466678619385, "num_tokens": 133494598.0, "step": 3497 }, { "epoch": 0.44498155450960436, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.234582901000977, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8598872423171997, "num_tokens": 133536403.0, "step": 3498 }, { "epoch": 0.4451087647881949, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.20157814025879, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8298144340515137, "num_tokens": 133567319.0, "step": 3499 }, { "epoch": 0.4452359750667854, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.03007698059082, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8433170318603516, "num_tokens": 133604829.0, "step": 3500 }, { "epoch": 0.4453631853453759, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.272714614868164, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8506685495376587, "num_tokens": 133638373.0, "step": 3501 }, { "epoch": 0.4454903956239664, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.12976837158203, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.862474799156189, "num_tokens": 133674110.0, "step": 3502 }, { "epoch": 0.44561760590255695, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.305646896362305, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8529632091522217, "num_tokens": 133711005.0, "step": 3503 }, { "epoch": 0.4457448161811474, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.306467056274414, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.863312840461731, "num_tokens": 133746258.0, "step": 3504 }, { "epoch": 0.44587202645973795, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.995067596435547, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8598624467849731, "num_tokens": 133790695.0, "step": 3505 }, { "epoch": 0.4459992367383285, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.17432975769043, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8474213480949402, "num_tokens": 133830181.0, "step": 3506 }, { "epoch": 0.44612644701691895, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.062599182128906, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8547489643096924, "num_tokens": 133866849.0, "step": 3507 }, { "epoch": 0.4462536572955095, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.058917999267578, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8498488664627075, "num_tokens": 133906812.0, "step": 3508 }, { "epoch": 0.4463808675741, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.02640151977539, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8442428112030029, "num_tokens": 133948956.0, "step": 3509 }, { "epoch": 0.4465080778526905, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 25.923423767089844, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8337094187736511, "num_tokens": 133988764.0, "step": 3510 }, { "epoch": 0.446635288131281, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.03546905517578, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8560693264007568, "num_tokens": 134028148.0, "step": 3511 }, { "epoch": 0.44676249840987153, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.467517852783203, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8269295692443848, "num_tokens": 134065909.0, "step": 3512 }, { "epoch": 0.446889708688462, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.704795837402344, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8582931756973267, "num_tokens": 134108342.0, "step": 3513 }, { "epoch": 0.44701691896705253, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 28.125694274902344, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.850330114364624, "num_tokens": 134145928.0, "step": 3514 }, { "epoch": 0.44714412924564306, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.382427215576172, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8466041088104248, "num_tokens": 134185955.0, "step": 3515 }, { "epoch": 0.44727133952423354, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 29.214189529418945, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.857944667339325, "num_tokens": 134225829.0, "step": 3516 }, { "epoch": 0.44739854980282406, "ewc_loss": 0.05615234375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 25.871845245361328, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.840901255607605, "num_tokens": 134261381.0, "step": 3517 }, { "epoch": 0.4475257600814146, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.721654891967773, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8586363792419434, "num_tokens": 134294697.0, "step": 3518 }, { "epoch": 0.44765297036000506, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.39019775390625, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8713939785957336, "num_tokens": 134334349.0, "step": 3519 }, { "epoch": 0.4477801806385956, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 27.30810546875, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8404296636581421, "num_tokens": 134373381.0, "step": 3520 }, { "epoch": 0.4479073909171861, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 26.089252471923828, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.856183648109436, "num_tokens": 134407952.0, "step": 3521 }, { "epoch": 0.4480346011957766, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.371788024902344, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8406707048416138, "num_tokens": 134458862.0, "step": 3522 }, { "epoch": 0.4481618114743671, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.442520141601562, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8482943773269653, "num_tokens": 134494007.0, "step": 3523 }, { "epoch": 0.44828902175295765, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.9595947265625, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8373163938522339, "num_tokens": 134534195.0, "step": 3524 }, { "epoch": 0.4484162320315481, "ewc_loss": 0.0556640625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.342777252197266, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8584764003753662, "num_tokens": 134570241.0, "step": 3525 }, { "epoch": 0.44854344231013865, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 28.535367965698242, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8492801189422607, "num_tokens": 134611904.0, "step": 3526 }, { "epoch": 0.4486706525887292, "ewc_loss": 0.05517578125, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.315376281738281e-05, "grad_norm": 25.760038375854492, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8422281742095947, "num_tokens": 134644426.0, "step": 3527 }, { "epoch": 0.44879786286731965, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 27.695556640625, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8603509068489075, "num_tokens": 134678979.0, "step": 3528 }, { "epoch": 0.4489250731459102, "ewc_loss": 0.055908203125, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 4.363059997558594e-05, "grad_norm": 26.87533187866211, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8537126779556274, "num_tokens": 134717270.0, "step": 3529 }, { "epoch": 0.4490522834245007, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.356937408447266, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8513960838317871, "num_tokens": 134755464.0, "step": 3530 }, { "epoch": 0.44917949370309124, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 25.962364196777344, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.843751847743988, "num_tokens": 134793737.0, "step": 3531 }, { "epoch": 0.4493067039816817, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.633468627929688, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8460724353790283, "num_tokens": 134834332.0, "step": 3532 }, { "epoch": 0.44943391426027224, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.410743713378906e-05, "grad_norm": 26.546260833740234, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8611820936203003, "num_tokens": 134871027.0, "step": 3533 }, { "epoch": 0.44956112453886277, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.297372817993164, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8506650924682617, "num_tokens": 134908529.0, "step": 3534 }, { "epoch": 0.44968833481745324, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.2833251953125, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8604558706283569, "num_tokens": 134944177.0, "step": 3535 }, { "epoch": 0.44981554509604377, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.51695442199707, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.850321888923645, "num_tokens": 134977928.0, "step": 3536 }, { "epoch": 0.4499427553746343, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.514699935913086, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8510292172431946, "num_tokens": 135017554.0, "step": 3537 }, { "epoch": 0.45006996565322477, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.103683471679688, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8566449284553528, "num_tokens": 135058694.0, "step": 3538 }, { "epoch": 0.4501971759318153, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.76959800720215, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8523598909378052, "num_tokens": 135094037.0, "step": 3539 }, { "epoch": 0.4503243862104058, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.213607788085938, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.845721960067749, "num_tokens": 135132758.0, "step": 3540 }, { "epoch": 0.4504515964889963, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.148914337158203, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8634625673294067, "num_tokens": 135169398.0, "step": 3541 }, { "epoch": 0.4505788067675868, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.595523834228516, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8365123271942139, "num_tokens": 135204312.0, "step": 3542 }, { "epoch": 0.45070601704617735, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.15601348876953, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8484488725662231, "num_tokens": 135246287.0, "step": 3543 }, { "epoch": 0.4508332273247678, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.166982650756836, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8556072115898132, "num_tokens": 135280970.0, "step": 3544 }, { "epoch": 0.45096043760335836, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.4134521484375, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8721965551376343, "num_tokens": 135317873.0, "step": 3545 }, { "epoch": 0.4510876478819489, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.089447021484375, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8599237203598022, "num_tokens": 135354994.0, "step": 3546 }, { "epoch": 0.45121485816053936, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.165292739868164, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8464103937149048, "num_tokens": 135393050.0, "step": 3547 }, { "epoch": 0.4513420684391299, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.240938186645508, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8560301661491394, "num_tokens": 135431704.0, "step": 3548 }, { "epoch": 0.4514692787177204, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.233381271362305, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8282514214515686, "num_tokens": 135471306.0, "step": 3549 }, { "epoch": 0.4515964889963109, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.414438247680664, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.83852219581604, "num_tokens": 135509647.0, "step": 3550 }, { "epoch": 0.4517236992749014, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.211641311645508, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8594087362289429, "num_tokens": 135550469.0, "step": 3551 }, { "epoch": 0.45185090955349194, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.425559997558594, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8512865304946899, "num_tokens": 135589290.0, "step": 3552 }, { "epoch": 0.4519781198320824, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.297931671142578, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8452426195144653, "num_tokens": 135624416.0, "step": 3553 }, { "epoch": 0.45210533011067294, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.445436477661133, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8551793694496155, "num_tokens": 135662316.0, "step": 3554 }, { "epoch": 0.45223254038926347, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.10856056213379, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8539689779281616, "num_tokens": 135701739.0, "step": 3555 }, { "epoch": 0.45235975066785394, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.675294876098633, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8480214476585388, "num_tokens": 135741372.0, "step": 3556 }, { "epoch": 0.45248696094644447, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.189756393432617, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8747769594192505, "num_tokens": 135779945.0, "step": 3557 }, { "epoch": 0.452614171225035, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.399259567260742, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8454701900482178, "num_tokens": 135813179.0, "step": 3558 }, { "epoch": 0.4527413815036255, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.286087036132812, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8532599210739136, "num_tokens": 135855147.0, "step": 3559 }, { "epoch": 0.452868591782216, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.53257942199707, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8596991300582886, "num_tokens": 135892990.0, "step": 3560 }, { "epoch": 0.45299580206080653, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.488862991333008, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8707785606384277, "num_tokens": 135926911.0, "step": 3561 }, { "epoch": 0.453123012339397, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.578092575073242, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8603258728981018, "num_tokens": 135963530.0, "step": 3562 }, { "epoch": 0.45325022261798753, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.428924560546875, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8419591188430786, "num_tokens": 136000677.0, "step": 3563 }, { "epoch": 0.45337743289657806, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.821672439575195, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8415890336036682, "num_tokens": 136037283.0, "step": 3564 }, { "epoch": 0.45350464317516853, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.025493621826172, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8329089879989624, "num_tokens": 136071454.0, "step": 3565 }, { "epoch": 0.45363185345375906, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.51727867126465, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8411076068878174, "num_tokens": 136105638.0, "step": 3566 }, { "epoch": 0.4537590637323496, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.466346740722656, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8549967408180237, "num_tokens": 136147968.0, "step": 3567 }, { "epoch": 0.45388627401094006, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.279640197753906, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8520373702049255, "num_tokens": 136190468.0, "step": 3568 }, { "epoch": 0.4540134842895306, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.49321746826172, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8558008670806885, "num_tokens": 136229363.0, "step": 3569 }, { "epoch": 0.4541406945681211, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.160799026489258, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8441100120544434, "num_tokens": 136268834.0, "step": 3570 }, { "epoch": 0.4542679048467116, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.987218856811523, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8284550905227661, "num_tokens": 136306761.0, "step": 3571 }, { "epoch": 0.4543951151253021, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.16518783569336, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8522705435752869, "num_tokens": 136347736.0, "step": 3572 }, { "epoch": 0.45452232540389265, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.5153751373291, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8535796403884888, "num_tokens": 136381009.0, "step": 3573 }, { "epoch": 0.4546495356824831, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.292707443237305, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8615931272506714, "num_tokens": 136418050.0, "step": 3574 }, { "epoch": 0.45477674596107365, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.82695960998535, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8335293531417847, "num_tokens": 136459109.0, "step": 3575 }, { "epoch": 0.4549039562396642, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.32090950012207, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.847368597984314, "num_tokens": 136497993.0, "step": 3576 }, { "epoch": 0.45503116651825465, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.55885124206543, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.854341983795166, "num_tokens": 136534446.0, "step": 3577 }, { "epoch": 0.4551583767968452, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.36407470703125, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8363455533981323, "num_tokens": 136571117.0, "step": 3578 }, { "epoch": 0.4552855870754357, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.693981170654297, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8586833477020264, "num_tokens": 136601706.0, "step": 3579 }, { "epoch": 0.4554127973540262, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.286705017089844, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8524731397628784, "num_tokens": 136639735.0, "step": 3580 }, { "epoch": 0.4555400076326167, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.561954498291016, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8451429009437561, "num_tokens": 136675647.0, "step": 3581 }, { "epoch": 0.45566721791120723, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.488550186157227, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8586587905883789, "num_tokens": 136713361.0, "step": 3582 }, { "epoch": 0.45579442818979776, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.669845581054688, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8496977090835571, "num_tokens": 136751440.0, "step": 3583 }, { "epoch": 0.45592163846838824, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.314098358154297, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8475766777992249, "num_tokens": 136786078.0, "step": 3584 }, { "epoch": 0.45604884874697876, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.53811264038086, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.858778715133667, "num_tokens": 136828313.0, "step": 3585 }, { "epoch": 0.4561760590255693, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.283836364746094, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8494933843612671, "num_tokens": 136862549.0, "step": 3586 }, { "epoch": 0.45630326930415976, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.158321380615234, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8548324108123779, "num_tokens": 136906423.0, "step": 3587 }, { "epoch": 0.4564304795827503, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.615949630737305, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8400135636329651, "num_tokens": 136941550.0, "step": 3588 }, { "epoch": 0.4565576898613408, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.1727294921875, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8544405698776245, "num_tokens": 136980234.0, "step": 3589 }, { "epoch": 0.4566849001399313, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.484189987182617, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8549278974533081, "num_tokens": 137023991.0, "step": 3590 }, { "epoch": 0.4568121104185218, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.188268661499023, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8426614999771118, "num_tokens": 137063261.0, "step": 3591 }, { "epoch": 0.45693932069711235, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.43454933166504, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8605031967163086, "num_tokens": 137103935.0, "step": 3592 }, { "epoch": 0.4570665309757028, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.339885711669922, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8527992963790894, "num_tokens": 137140194.0, "step": 3593 }, { "epoch": 0.45719374125429335, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.462247848510742, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.842218279838562, "num_tokens": 137179086.0, "step": 3594 }, { "epoch": 0.4573209515328839, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.56796646118164, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8562778234481812, "num_tokens": 137216887.0, "step": 3595 }, { "epoch": 0.45744816181147435, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.48600959777832, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8669500350952148, "num_tokens": 137255029.0, "step": 3596 }, { "epoch": 0.4575753720900649, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.79667091369629, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8541659116744995, "num_tokens": 137291399.0, "step": 3597 }, { "epoch": 0.4577025823686554, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.64740753173828, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8550576567649841, "num_tokens": 137326569.0, "step": 3598 }, { "epoch": 0.4578297926472459, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 27.720745086669922, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8480044603347778, "num_tokens": 137370035.0, "step": 3599 }, { "epoch": 0.4579570029258364, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 27.784664154052734, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8482889533042908, "num_tokens": 137407969.0, "step": 3600 }, { "epoch": 0.45808421320442694, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 28.07777976989746, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8501214981079102, "num_tokens": 137447256.0, "step": 3601 }, { "epoch": 0.4582114234830174, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.810287475585938, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8369169235229492, "num_tokens": 137479762.0, "step": 3602 }, { "epoch": 0.45833863376160794, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.234209060668945, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.844810962677002, "num_tokens": 137521823.0, "step": 3603 }, { "epoch": 0.45846584404019847, "ewc_loss": 0.056640625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.4345855712890625e-05, "grad_norm": 26.392988204956055, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8498417139053345, "num_tokens": 137558174.0, "step": 3604 }, { "epoch": 0.45859305431878894, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 27.102720260620117, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8256548643112183, "num_tokens": 137597265.0, "step": 3605 }, { "epoch": 0.45872026459737947, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.304777145385742, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8597030639648438, "num_tokens": 137635954.0, "step": 3606 }, { "epoch": 0.45884747487597, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.86058807373047, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8545578122138977, "num_tokens": 137676137.0, "step": 3607 }, { "epoch": 0.45897468515456047, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.50194549560547, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8543843626976013, "num_tokens": 137716723.0, "step": 3608 }, { "epoch": 0.459101895433151, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.697031021118164, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8330368995666504, "num_tokens": 137752435.0, "step": 3609 }, { "epoch": 0.4592291057117415, "ewc_loss": 0.056884765625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.458427429199219e-05, "grad_norm": 26.352344512939453, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8428553342819214, "num_tokens": 137786684.0, "step": 3610 }, { "epoch": 0.459356315990332, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.149696350097656, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8436230421066284, "num_tokens": 137830668.0, "step": 3611 }, { "epoch": 0.4594835262689225, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.2910213470459, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.857555091381073, "num_tokens": 137865262.0, "step": 3612 }, { "epoch": 0.45961073654751305, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.56725311279297, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.858252227306366, "num_tokens": 137898867.0, "step": 3613 }, { "epoch": 0.4597379468261035, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.421579360961914, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8524477481842041, "num_tokens": 137935108.0, "step": 3614 }, { "epoch": 0.45986515710469406, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.457508087158203, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8632161617279053, "num_tokens": 137977714.0, "step": 3615 }, { "epoch": 0.4599923673832846, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.821613311767578, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8652874231338501, "num_tokens": 138006345.0, "step": 3616 }, { "epoch": 0.46011957766187506, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.462726593017578, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8512486219406128, "num_tokens": 138045106.0, "step": 3617 }, { "epoch": 0.4602467879404656, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.938322067260742, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8457260131835938, "num_tokens": 138079993.0, "step": 3618 }, { "epoch": 0.4603739982190561, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.512544631958008, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8541792631149292, "num_tokens": 138118555.0, "step": 3619 }, { "epoch": 0.4605012084976466, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.93299674987793, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8608173131942749, "num_tokens": 138155096.0, "step": 3620 }, { "epoch": 0.4606284187762371, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.65653419494629, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8538087010383606, "num_tokens": 138194529.0, "step": 3621 }, { "epoch": 0.46075562905482764, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.825550079345703, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8267720341682434, "num_tokens": 138235520.0, "step": 3622 }, { "epoch": 0.4608828393334181, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.601547241210938, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.860470175743103, "num_tokens": 138271391.0, "step": 3623 }, { "epoch": 0.46101004961200864, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.8575496673584, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8469102382659912, "num_tokens": 138313923.0, "step": 3624 }, { "epoch": 0.46113725989059917, "ewc_loss": 0.05712890625, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.482269287109375e-05, "grad_norm": 26.43426513671875, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8434325456619263, "num_tokens": 138355992.0, "step": 3625 }, { "epoch": 0.46126447016918964, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.792530059814453, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8716047406196594, "num_tokens": 138393074.0, "step": 3626 }, { "epoch": 0.4613916804477802, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.732473373413086, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8593732118606567, "num_tokens": 138427854.0, "step": 3627 }, { "epoch": 0.4615188907263707, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.801809310913086, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8617739677429199, "num_tokens": 138467359.0, "step": 3628 }, { "epoch": 0.4616461010049612, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.6955509185791, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.840347945690155, "num_tokens": 138505355.0, "step": 3629 }, { "epoch": 0.4617733112835517, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.80162811279297, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8573330640792847, "num_tokens": 138543703.0, "step": 3630 }, { "epoch": 0.46190052156214223, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.243759155273438, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8441017866134644, "num_tokens": 138579860.0, "step": 3631 }, { "epoch": 0.46202773184073276, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 27.230485916137695, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8614118099212646, "num_tokens": 138615755.0, "step": 3632 }, { "epoch": 0.46215494211932323, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.53993034362793, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8470907211303711, "num_tokens": 138647607.0, "step": 3633 }, { "epoch": 0.46228215239791376, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.978330612182617, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8578093647956848, "num_tokens": 138679284.0, "step": 3634 }, { "epoch": 0.4624093626765043, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.551666259765625, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8616629838943481, "num_tokens": 138719690.0, "step": 3635 }, { "epoch": 0.46253657295509476, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.833391189575195, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8405886888504028, "num_tokens": 138758031.0, "step": 3636 }, { "epoch": 0.4626637832336853, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.612627029418945, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8511569499969482, "num_tokens": 138797283.0, "step": 3637 }, { "epoch": 0.4627909935122758, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.66007423400879, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8484858870506287, "num_tokens": 138834672.0, "step": 3638 }, { "epoch": 0.4629182037908663, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.744338989257812, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8626184463500977, "num_tokens": 138871105.0, "step": 3639 }, { "epoch": 0.4630454140694568, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.4774169921875, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8521204590797424, "num_tokens": 138903516.0, "step": 3640 }, { "epoch": 0.46317262434804735, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.728404998779297, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8510755896568298, "num_tokens": 138942021.0, "step": 3641 }, { "epoch": 0.4632998346266378, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.301376342773438, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8605552315711975, "num_tokens": 138978210.0, "step": 3642 }, { "epoch": 0.46342704490522835, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.76643180847168, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8550586700439453, "num_tokens": 139019122.0, "step": 3643 }, { "epoch": 0.4635542551838189, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.630184173583984, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8634089231491089, "num_tokens": 139062187.0, "step": 3644 }, { "epoch": 0.46368146546240935, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.71039390563965, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.871523380279541, "num_tokens": 139102960.0, "step": 3645 }, { "epoch": 0.4638086757409999, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.654125213623047, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8533328771591187, "num_tokens": 139137302.0, "step": 3646 }, { "epoch": 0.4639358860195904, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.473846435546875, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8339444398880005, "num_tokens": 139177195.0, "step": 3647 }, { "epoch": 0.4640630962981809, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 27.09808349609375, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8482986092567444, "num_tokens": 139214722.0, "step": 3648 }, { "epoch": 0.4641903065767714, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.413755416870117, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8521500825881958, "num_tokens": 139248569.0, "step": 3649 }, { "epoch": 0.46431751685536193, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.693056106567383, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8578449487686157, "num_tokens": 139289141.0, "step": 3650 }, { "epoch": 0.4644447271339524, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.948442459106445, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8687750697135925, "num_tokens": 139330791.0, "step": 3651 }, { "epoch": 0.46457193741254293, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.87177276611328, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8203684091567993, "num_tokens": 139373009.0, "step": 3652 }, { "epoch": 0.46469914769113346, "ewc_loss": 0.057373046875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.947006225585938, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8519760370254517, "num_tokens": 139408894.0, "step": 3653 }, { "epoch": 0.46482635796972394, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.815576553344727, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8494371175765991, "num_tokens": 139444040.0, "step": 3654 }, { "epoch": 0.46495356824831446, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.452810287475586, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8587063550949097, "num_tokens": 139479675.0, "step": 3655 }, { "epoch": 0.465080778526905, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.91689682006836, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8597300052642822, "num_tokens": 139521792.0, "step": 3656 }, { "epoch": 0.46520798880549546, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.6219539642334, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8594760894775391, "num_tokens": 139563471.0, "step": 3657 }, { "epoch": 0.465335199084086, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.89963722229004, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8515346646308899, "num_tokens": 139604702.0, "step": 3658 }, { "epoch": 0.4654624093626765, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.426355361938477, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8504853844642639, "num_tokens": 139644573.0, "step": 3659 }, { "epoch": 0.465589619641267, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.612886428833008, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8378818035125732, "num_tokens": 139680924.0, "step": 3660 }, { "epoch": 0.4657168299198575, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.64158821105957, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8444556593894958, "num_tokens": 139724369.0, "step": 3661 }, { "epoch": 0.46584404019844805, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.77759552001953, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8525171279907227, "num_tokens": 139769412.0, "step": 3662 }, { "epoch": 0.4659712504770385, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.65265655517578, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8267989158630371, "num_tokens": 139808066.0, "step": 3663 }, { "epoch": 0.46609846075562905, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.373157501220703, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8575160503387451, "num_tokens": 139851169.0, "step": 3664 }, { "epoch": 0.4662256710342196, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.725934982299805, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8619325160980225, "num_tokens": 139890550.0, "step": 3665 }, { "epoch": 0.46635288131281005, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.63549041748047, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8261070251464844, "num_tokens": 139931496.0, "step": 3666 }, { "epoch": 0.4664800915914006, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 27.05304718017578, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8429715037345886, "num_tokens": 139965872.0, "step": 3667 }, { "epoch": 0.4666073018699911, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.43587875366211, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.836576521396637, "num_tokens": 140000394.0, "step": 3668 }, { "epoch": 0.4667345121485816, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 27.40387535095215, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8356480598449707, "num_tokens": 140039967.0, "step": 3669 }, { "epoch": 0.4668617224271721, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.685522079467773, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.852871835231781, "num_tokens": 140075739.0, "step": 3670 }, { "epoch": 0.46698893270576264, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.875450134277344, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.85687255859375, "num_tokens": 140110016.0, "step": 3671 }, { "epoch": 0.4671161429843531, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.79256248474121, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8416763544082642, "num_tokens": 140147905.0, "step": 3672 }, { "epoch": 0.46724335326294364, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 27.185413360595703, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.844886064529419, "num_tokens": 140182284.0, "step": 3673 }, { "epoch": 0.46737056354153417, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.578636169433594, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8440862894058228, "num_tokens": 140220669.0, "step": 3674 }, { "epoch": 0.46749777382012464, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.956562042236328, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8376504182815552, "num_tokens": 140258399.0, "step": 3675 }, { "epoch": 0.46762498409871517, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.76673126220703, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8662316799163818, "num_tokens": 140296015.0, "step": 3676 }, { "epoch": 0.4677521943773057, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 27.143104553222656, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8419200778007507, "num_tokens": 140338618.0, "step": 3677 }, { "epoch": 0.46787940465589617, "ewc_loss": 0.0576171875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.506111145019531e-05, "grad_norm": 26.536725997924805, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8557960987091064, "num_tokens": 140379151.0, "step": 3678 }, { "epoch": 0.4680066149344867, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.104080200195312, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.837421715259552, "num_tokens": 140417740.0, "step": 3679 }, { "epoch": 0.4681338252130772, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.409725189208984, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8480106592178345, "num_tokens": 140452454.0, "step": 3680 }, { "epoch": 0.46826103549166775, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.143760681152344, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8271212577819824, "num_tokens": 140492981.0, "step": 3681 }, { "epoch": 0.4683882457702582, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.6755428314209, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8408094048500061, "num_tokens": 140534131.0, "step": 3682 }, { "epoch": 0.46851545604884876, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.936641693115234, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8437594771385193, "num_tokens": 140576761.0, "step": 3683 }, { "epoch": 0.4686426663274393, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.557228088378906, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8316571712493896, "num_tokens": 140609538.0, "step": 3684 }, { "epoch": 0.46876987660602976, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.152664184570312, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8462479710578918, "num_tokens": 140641802.0, "step": 3685 }, { "epoch": 0.4688970868846203, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.712820053100586, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8400701284408569, "num_tokens": 140685832.0, "step": 3686 }, { "epoch": 0.4690242971632108, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 26.793851852416992, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8387719392776489, "num_tokens": 140725333.0, "step": 3687 }, { "epoch": 0.4691515074418013, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 27.218931198120117, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8500263690948486, "num_tokens": 140760838.0, "step": 3688 }, { "epoch": 0.4692787177203918, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.538684844970703, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8396010398864746, "num_tokens": 140797367.0, "step": 3689 }, { "epoch": 0.46940592799898234, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.154476165771484, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8500229120254517, "num_tokens": 140839778.0, "step": 3690 }, { "epoch": 0.4695331382775728, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.55507469177246, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8450268507003784, "num_tokens": 140880340.0, "step": 3691 }, { "epoch": 0.46966034855616334, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 27.049320220947266, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8449501991271973, "num_tokens": 140915451.0, "step": 3692 }, { "epoch": 0.46978755883475387, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.642738342285156, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8477575778961182, "num_tokens": 140956967.0, "step": 3693 }, { "epoch": 0.46991476911334434, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.73279571533203, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8658002614974976, "num_tokens": 140996640.0, "step": 3694 }, { "epoch": 0.47004197939193487, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.91440200805664, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8503286838531494, "num_tokens": 141036094.0, "step": 3695 }, { "epoch": 0.4701691896705254, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.81854820251465, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8402630686759949, "num_tokens": 141069033.0, "step": 3696 }, { "epoch": 0.4702963999491159, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.507055282592773, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8681668043136597, "num_tokens": 141111075.0, "step": 3697 }, { "epoch": 0.4704236102277064, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.834026336669922, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8655065894126892, "num_tokens": 141145270.0, "step": 3698 }, { "epoch": 0.47055082050629693, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.86115837097168, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8640903830528259, "num_tokens": 141182132.0, "step": 3699 }, { "epoch": 0.4706780307848874, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.794504165649414, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8507146835327148, "num_tokens": 141217455.0, "step": 3700 }, { "epoch": 0.47080524106347793, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 27.27823257446289, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8502933979034424, "num_tokens": 141249680.0, "step": 3701 }, { "epoch": 0.47093245134206846, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.691354751586914, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8517509698867798, "num_tokens": 141292085.0, "step": 3702 }, { "epoch": 0.47105966162065893, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.991832733154297, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.858983039855957, "num_tokens": 141331455.0, "step": 3703 }, { "epoch": 0.47118687189924946, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 27.179821014404297, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.851823091506958, "num_tokens": 141365623.0, "step": 3704 }, { "epoch": 0.47131408217784, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.796541213989258, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8423556089401245, "num_tokens": 141399033.0, "step": 3705 }, { "epoch": 0.47144129245643046, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.101505279541016, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8572689890861511, "num_tokens": 141430436.0, "step": 3706 }, { "epoch": 0.471568502735021, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.69002342224121, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.867887020111084, "num_tokens": 141463169.0, "step": 3707 }, { "epoch": 0.4716957130136115, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.239694595336914, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8677616119384766, "num_tokens": 141495524.0, "step": 3708 }, { "epoch": 0.471822923292202, "ewc_loss": 0.057861328125, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.5299530029296875e-05, "grad_norm": 26.571971893310547, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8615446090698242, "num_tokens": 141533099.0, "step": 3709 }, { "epoch": 0.4719501335707925, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.522363662719727, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8573914170265198, "num_tokens": 141576389.0, "step": 3710 }, { "epoch": 0.47207734384938305, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.55337905883789, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8543210029602051, "num_tokens": 141611870.0, "step": 3711 }, { "epoch": 0.4722045541279735, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.308473587036133, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8624286651611328, "num_tokens": 141648539.0, "step": 3712 }, { "epoch": 0.47233176440656405, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.685317993164062, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8556746244430542, "num_tokens": 141688549.0, "step": 3713 }, { "epoch": 0.4724589746851546, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.836843490600586, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8607750535011292, "num_tokens": 141725294.0, "step": 3714 }, { "epoch": 0.47258618496374505, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.67369270324707, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.835289716720581, "num_tokens": 141756761.0, "step": 3715 }, { "epoch": 0.4727133952423356, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.02351951599121, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8342673182487488, "num_tokens": 141790417.0, "step": 3716 }, { "epoch": 0.4728406055209261, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.572275161743164, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8637783527374268, "num_tokens": 141830749.0, "step": 3717 }, { "epoch": 0.4729678157995166, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.71875762939453, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8551861047744751, "num_tokens": 141868433.0, "step": 3718 }, { "epoch": 0.4730950260781071, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.578245162963867, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8688127994537354, "num_tokens": 141907140.0, "step": 3719 }, { "epoch": 0.47322223635669763, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 27.05730628967285, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8659191727638245, "num_tokens": 141947436.0, "step": 3720 }, { "epoch": 0.4733494466352881, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.704984664916992, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8439220190048218, "num_tokens": 141986876.0, "step": 3721 }, { "epoch": 0.47347665691387864, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.80849266052246, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.856406033039093, "num_tokens": 142025806.0, "step": 3722 }, { "epoch": 0.47360386719246916, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.801063537597656, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8524013757705688, "num_tokens": 142057932.0, "step": 3723 }, { "epoch": 0.47373107747105964, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.17998504638672, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8554626703262329, "num_tokens": 142088880.0, "step": 3724 }, { "epoch": 0.47385828774965016, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.543962478637695, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8656298518180847, "num_tokens": 142124959.0, "step": 3725 }, { "epoch": 0.4739854980282407, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.01661491394043, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8381924629211426, "num_tokens": 142160667.0, "step": 3726 }, { "epoch": 0.47411270830683117, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.592130661010742, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8510299921035767, "num_tokens": 142205017.0, "step": 3727 }, { "epoch": 0.4742399185854217, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.08547592163086, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8592222929000854, "num_tokens": 142241479.0, "step": 3728 }, { "epoch": 0.4743671288640122, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.673315048217773, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8544290065765381, "num_tokens": 142280881.0, "step": 3729 }, { "epoch": 0.4744943391426027, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.970848083496094, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8599222302436829, "num_tokens": 142328248.0, "step": 3730 }, { "epoch": 0.4746215494211932, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 26.76699447631836, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8359644412994385, "num_tokens": 142364872.0, "step": 3731 }, { "epoch": 0.47474875969978375, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.85309410095215, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8343541622161865, "num_tokens": 142400494.0, "step": 3732 }, { "epoch": 0.4748759699783743, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.97272300720215, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8488475680351257, "num_tokens": 142433739.0, "step": 3733 }, { "epoch": 0.47500318025696475, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 4.57763671875e-05, "grad_norm": 44.873924255371094, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8607904314994812, "num_tokens": 142469777.0, "step": 3734 }, { "epoch": 0.4751303905355553, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.78422737121582, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8434613943099976, "num_tokens": 142510052.0, "step": 3735 }, { "epoch": 0.4752576008141458, "ewc_loss": 0.056396484375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.38690185546875e-05, "grad_norm": 25.400184631347656, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8406261205673218, "num_tokens": 142545689.0, "step": 3736 }, { "epoch": 0.4753848110927363, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 28.036970138549805, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8521726727485657, "num_tokens": 142585916.0, "step": 3737 }, { "epoch": 0.4755120213713268, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 26.8148250579834, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8592599630355835, "num_tokens": 142623019.0, "step": 3738 }, { "epoch": 0.47563923164991734, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.385486602783203, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8494813442230225, "num_tokens": 142660986.0, "step": 3739 }, { "epoch": 0.4757664419285078, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.804716110229492, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8554059267044067, "num_tokens": 142702876.0, "step": 3740 }, { "epoch": 0.47589365220709834, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.367097854614258, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8408186435699463, "num_tokens": 142744082.0, "step": 3741 }, { "epoch": 0.47602086248568887, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.789487838745117, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8332468867301941, "num_tokens": 142777511.0, "step": 3742 }, { "epoch": 0.47614807276427934, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.108213424682617, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8468055725097656, "num_tokens": 142818882.0, "step": 3743 }, { "epoch": 0.47627528304286987, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 26.709178924560547, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8549413084983826, "num_tokens": 142852771.0, "step": 3744 }, { "epoch": 0.4764024933214604, "ewc_loss": 0.058349609375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.2545108795166, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8519248366355896, "num_tokens": 142888398.0, "step": 3745 }, { "epoch": 0.47652970360005087, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.880901336669922, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8680546283721924, "num_tokens": 142924789.0, "step": 3746 }, { "epoch": 0.4766569138786414, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.081775665283203, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8593580722808838, "num_tokens": 142958843.0, "step": 3747 }, { "epoch": 0.4767841241572319, "ewc_loss": 0.05810546875, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.553794860839844e-05, "grad_norm": 26.93113899230957, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8639054298400879, "num_tokens": 142996408.0, "step": 3748 }, { "epoch": 0.4769113344358224, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 26.739866256713867, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8441843390464783, "num_tokens": 143040031.0, "step": 3749 }, { "epoch": 0.4770385447144129, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.932857513427734, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8313273191452026, "num_tokens": 143081648.0, "step": 3750 }, { "epoch": 0.47716575499300345, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.991920471191406, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8310290575027466, "num_tokens": 143118913.0, "step": 3751 }, { "epoch": 0.4772929652715939, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.979318618774414, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8614675998687744, "num_tokens": 143153858.0, "step": 3752 }, { "epoch": 0.47742017555018446, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.980802536010742, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8623694181442261, "num_tokens": 143190977.0, "step": 3753 }, { "epoch": 0.477547385828775, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.6708984375, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8441804051399231, "num_tokens": 143228009.0, "step": 3754 }, { "epoch": 0.47767459610736546, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.17490577697754, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8408046960830688, "num_tokens": 143269820.0, "step": 3755 }, { "epoch": 0.477801806385956, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.60826873779297, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8566775918006897, "num_tokens": 143303441.0, "step": 3756 }, { "epoch": 0.4779290166645465, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 26.905935287475586, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8435935974121094, "num_tokens": 143338748.0, "step": 3757 }, { "epoch": 0.478056226943137, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.740066528320312, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8517259359359741, "num_tokens": 143373900.0, "step": 3758 }, { "epoch": 0.4781834372217275, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.882495880126953, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8475894927978516, "num_tokens": 143411190.0, "step": 3759 }, { "epoch": 0.47831064750031804, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.691076278686523, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8487166166305542, "num_tokens": 143449470.0, "step": 3760 }, { "epoch": 0.4784378577789085, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.038951873779297, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8571242094039917, "num_tokens": 143485868.0, "step": 3761 }, { "epoch": 0.47856506805749904, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.96187400817871, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8607254028320312, "num_tokens": 143520334.0, "step": 3762 }, { "epoch": 0.47869227833608957, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.756744384765625, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8446651697158813, "num_tokens": 143557752.0, "step": 3763 }, { "epoch": 0.47881948861468004, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.9019718170166, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8523983359336853, "num_tokens": 143593385.0, "step": 3764 }, { "epoch": 0.4789466988932706, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.913185119628906, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8308486342430115, "num_tokens": 143631975.0, "step": 3765 }, { "epoch": 0.4790739091718611, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.56085205078125, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8346801996231079, "num_tokens": 143668382.0, "step": 3766 }, { "epoch": 0.4792011194504516, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.94913101196289, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8383224010467529, "num_tokens": 143706623.0, "step": 3767 }, { "epoch": 0.4793283297290421, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 26.594228744506836, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8550440073013306, "num_tokens": 143745664.0, "step": 3768 }, { "epoch": 0.47945554000763263, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.27834701538086, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.84251469373703, "num_tokens": 143780761.0, "step": 3769 }, { "epoch": 0.4795827502862231, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.743799209594727, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.856361985206604, "num_tokens": 143825298.0, "step": 3770 }, { "epoch": 0.47970996056481363, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.068973541259766, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8716038465499878, "num_tokens": 143859855.0, "step": 3771 }, { "epoch": 0.47983717084340416, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.824613571166992, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8656885623931885, "num_tokens": 143898553.0, "step": 3772 }, { "epoch": 0.47996438112199463, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.24070167541504, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8495693206787109, "num_tokens": 143935362.0, "step": 3773 }, { "epoch": 0.48009159140058516, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.866573333740234, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8531482219696045, "num_tokens": 143978692.0, "step": 3774 }, { "epoch": 0.4802188016791757, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.75507926940918, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8586106896400452, "num_tokens": 144020951.0, "step": 3775 }, { "epoch": 0.48034601195776616, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.825733184814453, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8613632917404175, "num_tokens": 144058341.0, "step": 3776 }, { "epoch": 0.4804732222363567, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.02588653564453, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8447250127792358, "num_tokens": 144098232.0, "step": 3777 }, { "epoch": 0.4806004325149472, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.8552188873291, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8474513292312622, "num_tokens": 144137537.0, "step": 3778 }, { "epoch": 0.4807276427935377, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.948726654052734, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8575997352600098, "num_tokens": 144178459.0, "step": 3779 }, { "epoch": 0.4808548530721282, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.04281997680664, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8470103144645691, "num_tokens": 144219257.0, "step": 3780 }, { "epoch": 0.48098206335071875, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.73969078063965, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8576672673225403, "num_tokens": 144262232.0, "step": 3781 }, { "epoch": 0.4811092736293093, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.15471076965332, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8298985958099365, "num_tokens": 144297144.0, "step": 3782 }, { "epoch": 0.48123648390789975, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.986995697021484, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8531873822212219, "num_tokens": 144336078.0, "step": 3783 }, { "epoch": 0.4813636941864903, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.009201049804688, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8448629975318909, "num_tokens": 144380162.0, "step": 3784 }, { "epoch": 0.4814909044650808, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.64020538330078, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8362671732902527, "num_tokens": 144416735.0, "step": 3785 }, { "epoch": 0.4816181147436713, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.138200759887695, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8405541181564331, "num_tokens": 144452780.0, "step": 3786 }, { "epoch": 0.4817453250222618, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.76163673400879, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8583114147186279, "num_tokens": 144494155.0, "step": 3787 }, { "epoch": 0.48187253530085233, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.023605346679688, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8615304827690125, "num_tokens": 144533989.0, "step": 3788 }, { "epoch": 0.4819997455794428, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 27.002727508544922, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8481971025466919, "num_tokens": 144577409.0, "step": 3789 }, { "epoch": 0.48212695585803333, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.105356216430664, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8447729349136353, "num_tokens": 144618501.0, "step": 3790 }, { "epoch": 0.48225416613662386, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.15322494506836, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8602970838546753, "num_tokens": 144659454.0, "step": 3791 }, { "epoch": 0.48238137641521434, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 26.97085189819336, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8575619459152222, "num_tokens": 144695580.0, "step": 3792 }, { "epoch": 0.48250858669380486, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.08487892150879, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8348405957221985, "num_tokens": 144735586.0, "step": 3793 }, { "epoch": 0.4826357969723954, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 26.84877586364746, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8525821566581726, "num_tokens": 144776371.0, "step": 3794 }, { "epoch": 0.48276300725098586, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.170677185058594, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.845955491065979, "num_tokens": 144816724.0, "step": 3795 }, { "epoch": 0.4828902175295764, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.223997116088867, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8590711951255798, "num_tokens": 144854591.0, "step": 3796 }, { "epoch": 0.4830174278081669, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.113021850585938, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8542192578315735, "num_tokens": 144893694.0, "step": 3797 }, { "epoch": 0.4831446380867574, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.5418701171875, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8481418490409851, "num_tokens": 144932985.0, "step": 3798 }, { "epoch": 0.4832718483653479, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.941173553466797, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8512166738510132, "num_tokens": 144967823.0, "step": 3799 }, { "epoch": 0.48339905864393845, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.376270294189453, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8724243640899658, "num_tokens": 145007750.0, "step": 3800 }, { "epoch": 0.4835262689225289, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.459300994873047, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.845889687538147, "num_tokens": 145048596.0, "step": 3801 }, { "epoch": 0.48365347920111945, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.029014587402344, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.841960608959198, "num_tokens": 145080868.0, "step": 3802 }, { "epoch": 0.48378068947971, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.186555862426758, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8602353930473328, "num_tokens": 145115865.0, "step": 3803 }, { "epoch": 0.48390789975830045, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.233762741088867, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8656211495399475, "num_tokens": 145151748.0, "step": 3804 }, { "epoch": 0.484035110036891, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 26.964801788330078, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.852330207824707, "num_tokens": 145187074.0, "step": 3805 }, { "epoch": 0.4841623203154815, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.37965202331543, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8690149784088135, "num_tokens": 145224855.0, "step": 3806 }, { "epoch": 0.484289530594072, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 26.647977828979492, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.861789882183075, "num_tokens": 145266538.0, "step": 3807 }, { "epoch": 0.4844167408726625, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.487749099731445, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8573703169822693, "num_tokens": 145303560.0, "step": 3808 }, { "epoch": 0.48454395115125304, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.426191329956055, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8523762822151184, "num_tokens": 145345985.0, "step": 3809 }, { "epoch": 0.4846711614298435, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.08716583251953, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.84229576587677, "num_tokens": 145382132.0, "step": 3810 }, { "epoch": 0.48479837170843404, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.874656677246094, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8423737287521362, "num_tokens": 145418992.0, "step": 3811 }, { "epoch": 0.48492558198702457, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.601478576660156e-05, "grad_norm": 26.961719512939453, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8521336317062378, "num_tokens": 145455892.0, "step": 3812 }, { "epoch": 0.48505279226561504, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.74652099609375, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8496484756469727, "num_tokens": 145497217.0, "step": 3813 }, { "epoch": 0.48518000254420557, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.87200927734375, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.858555257320404, "num_tokens": 145533822.0, "step": 3814 }, { "epoch": 0.4853072128227961, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.357389450073242, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8712868094444275, "num_tokens": 145572967.0, "step": 3815 }, { "epoch": 0.48543442310138657, "ewc_loss": 0.05859375, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.212783813476562, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8668975830078125, "num_tokens": 145609436.0, "step": 3816 }, { "epoch": 0.4855616333799771, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.226490020751953, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8645844459533691, "num_tokens": 145645083.0, "step": 3817 }, { "epoch": 0.4856888436585676, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.244470596313477, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8334290981292725, "num_tokens": 145680249.0, "step": 3818 }, { "epoch": 0.4858160539371581, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.165605545043945, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8571915030479431, "num_tokens": 145718569.0, "step": 3819 }, { "epoch": 0.4859432642157486, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.238195419311523, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8367264866828918, "num_tokens": 145751868.0, "step": 3820 }, { "epoch": 0.48607047449433916, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.3038272857666, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.856488823890686, "num_tokens": 145787881.0, "step": 3821 }, { "epoch": 0.48619768477292963, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.38025665283203, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8599868416786194, "num_tokens": 145824010.0, "step": 3822 }, { "epoch": 0.48632489505152016, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.246883392333984, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8587507605552673, "num_tokens": 145860981.0, "step": 3823 }, { "epoch": 0.4864521053301107, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.078323364257812, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.855980634689331, "num_tokens": 145900687.0, "step": 3824 }, { "epoch": 0.48657931560870116, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.51165199279785, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8431187868118286, "num_tokens": 145938888.0, "step": 3825 }, { "epoch": 0.4867065258872917, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 26.919971466064453, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8481754660606384, "num_tokens": 145987060.0, "step": 3826 }, { "epoch": 0.4868337361658822, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.601417541503906, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8484609723091125, "num_tokens": 146026447.0, "step": 3827 }, { "epoch": 0.4869609464444727, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.47188949584961, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8558688163757324, "num_tokens": 146067099.0, "step": 3828 }, { "epoch": 0.4870881567230632, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.964996337890625, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.839528501033783, "num_tokens": 146110767.0, "step": 3829 }, { "epoch": 0.48721536700165374, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 26.682714462280273, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8439669609069824, "num_tokens": 146147603.0, "step": 3830 }, { "epoch": 0.48734257728024427, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.56134796142578, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8622995615005493, "num_tokens": 146188821.0, "step": 3831 }, { "epoch": 0.48746978755883474, "ewc_loss": 0.058837890625, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.6253204345703125e-05, "grad_norm": 27.405929565429688, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8490642309188843, "num_tokens": 146231886.0, "step": 3832 }, { "epoch": 0.48759699783742527, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.531597137451172, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8294267058372498, "num_tokens": 146271965.0, "step": 3833 }, { "epoch": 0.4877242081160158, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.24968719482422, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8401803970336914, "num_tokens": 146316686.0, "step": 3834 }, { "epoch": 0.4878514183946063, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.291208267211914, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8431164026260376, "num_tokens": 146355098.0, "step": 3835 }, { "epoch": 0.4879786286731968, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.598554611206055, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8577820062637329, "num_tokens": 146390666.0, "step": 3836 }, { "epoch": 0.48810583895178733, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.25957489013672, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8601431846618652, "num_tokens": 146426740.0, "step": 3837 }, { "epoch": 0.4882330492303778, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.660707473754883, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8440725803375244, "num_tokens": 146462668.0, "step": 3838 }, { "epoch": 0.48836025950896833, "ewc_loss": 0.05908203125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.649162292480469e-05, "grad_norm": 27.09800910949707, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.845824122428894, "num_tokens": 146500160.0, "step": 3839 }, { "epoch": 0.48848746978755886, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.40838050842285, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8483139872550964, "num_tokens": 146542963.0, "step": 3840 }, { "epoch": 0.48861468006614933, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.295989990234375, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8591504693031311, "num_tokens": 146580649.0, "step": 3841 }, { "epoch": 0.48874189034473986, "ewc_loss": 0.059326171875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.673004150390625e-05, "grad_norm": 27.08315086364746, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8592842221260071, "num_tokens": 146620823.0, "step": 3842 }, { "epoch": 0.4888691006233304, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.15367889404297, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.862366795539856, "num_tokens": 146663700.0, "step": 3843 }, { "epoch": 0.48899631090192086, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.35725212097168, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8472954630851746, "num_tokens": 146698211.0, "step": 3844 }, { "epoch": 0.4891235211805114, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.431821823120117, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.850451648235321, "num_tokens": 146738380.0, "step": 3845 }, { "epoch": 0.4892507314591019, "ewc_loss": 0.060302734375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.76837158203125e-05, "grad_norm": 27.324270248413086, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8548099994659424, "num_tokens": 146775413.0, "step": 3846 }, { "epoch": 0.4893779417376924, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.263568878173828, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8538477420806885, "num_tokens": 146814774.0, "step": 3847 }, { "epoch": 0.4895051520162829, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.350440979003906, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8623710870742798, "num_tokens": 146853957.0, "step": 3848 }, { "epoch": 0.48963236229487345, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.474994659423828, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8445099592208862, "num_tokens": 146889060.0, "step": 3849 }, { "epoch": 0.4897595725734639, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.10970115661621, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8634517192840576, "num_tokens": 146929849.0, "step": 3850 }, { "epoch": 0.48988678285205445, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.426198959350586, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8541502952575684, "num_tokens": 146968376.0, "step": 3851 }, { "epoch": 0.490013993130645, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.2524471282959, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8376237750053406, "num_tokens": 147002871.0, "step": 3852 }, { "epoch": 0.49014120340923545, "ewc_loss": 0.060302734375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.76837158203125e-05, "grad_norm": 27.462596893310547, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8494154214859009, "num_tokens": 147039412.0, "step": 3853 }, { "epoch": 0.490268413687826, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.242431640625, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.856654167175293, "num_tokens": 147080307.0, "step": 3854 }, { "epoch": 0.4903956239664165, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.467941284179688, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8594406843185425, "num_tokens": 147120131.0, "step": 3855 }, { "epoch": 0.490522834245007, "ewc_loss": 0.0595703125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.696846008300781e-05, "grad_norm": 27.297317504882812, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8450537919998169, "num_tokens": 147155760.0, "step": 3856 }, { "epoch": 0.4906500445235975, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.491670608520508, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8564618825912476, "num_tokens": 147195406.0, "step": 3857 }, { "epoch": 0.49077725480218803, "ewc_loss": 0.059814453125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.7206878662109375e-05, "grad_norm": 27.0727481842041, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8648577928543091, "num_tokens": 147235688.0, "step": 3858 }, { "epoch": 0.4909044650807785, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.38323211669922, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8616741895675659, "num_tokens": 147271575.0, "step": 3859 }, { "epoch": 0.49103167535936904, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.391977310180664, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8499489426612854, "num_tokens": 147315473.0, "step": 3860 }, { "epoch": 0.49115888563795956, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.631603240966797, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8570226430892944, "num_tokens": 147359058.0, "step": 3861 }, { "epoch": 0.49128609591655004, "ewc_loss": 0.060302734375, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.76837158203125e-05, "grad_norm": 27.289806365966797, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8571555614471436, "num_tokens": 147394443.0, "step": 3862 }, { "epoch": 0.49141330619514056, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.255571365356445, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8481596112251282, "num_tokens": 147436225.0, "step": 3863 }, { "epoch": 0.4915405164737311, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.541040420532227, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8613277673721313, "num_tokens": 147473449.0, "step": 3864 }, { "epoch": 0.49166772675232157, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.028297424316406, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8508489727973938, "num_tokens": 147510330.0, "step": 3865 }, { "epoch": 0.4917949370309121, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.492422103881836, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8641630411148071, "num_tokens": 147544683.0, "step": 3866 }, { "epoch": 0.4919221473095026, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 26.870569229125977, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8712566494941711, "num_tokens": 147589330.0, "step": 3867 }, { "epoch": 0.4920493575880931, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.633386611938477, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8222349882125854, "num_tokens": 147626864.0, "step": 3868 }, { "epoch": 0.4921765678666836, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.19461441040039, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8341392278671265, "num_tokens": 147672006.0, "step": 3869 }, { "epoch": 0.49230377814527415, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.536861419677734, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8442445993423462, "num_tokens": 147717388.0, "step": 3870 }, { "epoch": 0.4924309884238646, "ewc_loss": 0.06005859375, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.744529724121094e-05, "grad_norm": 27.173843383789062, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.840373694896698, "num_tokens": 147759827.0, "step": 3871 }, { "epoch": 0.49255819870245515, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.540481567382812, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8325831890106201, "num_tokens": 147793742.0, "step": 3872 }, { "epoch": 0.4926854089810457, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.124055862426758, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8625550270080566, "num_tokens": 147829881.0, "step": 3873 }, { "epoch": 0.49281261925963615, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.370256423950195, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8381385207176208, "num_tokens": 147875051.0, "step": 3874 }, { "epoch": 0.4929398295382267, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.101408004760742, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8455904126167297, "num_tokens": 147912786.0, "step": 3875 }, { "epoch": 0.4930670398168172, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.44493293762207, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8318151235580444, "num_tokens": 147954981.0, "step": 3876 }, { "epoch": 0.4931942500954077, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.258880615234375, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8455651998519897, "num_tokens": 147992179.0, "step": 3877 }, { "epoch": 0.4933214603739982, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.19205665588379, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8568087220191956, "num_tokens": 148029866.0, "step": 3878 }, { "epoch": 0.49344867065258874, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.52867317199707, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8540557622909546, "num_tokens": 148070057.0, "step": 3879 }, { "epoch": 0.4935758809311792, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 26.964378356933594, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8561980128288269, "num_tokens": 148108461.0, "step": 3880 }, { "epoch": 0.49370309120976974, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.537656784057617, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8463990688323975, "num_tokens": 148147609.0, "step": 3881 }, { "epoch": 0.49383030148836027, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 26.781824111938477, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8476871252059937, "num_tokens": 148190475.0, "step": 3882 }, { "epoch": 0.4939575117669508, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.667465209960938, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8643909692764282, "num_tokens": 148221187.0, "step": 3883 }, { "epoch": 0.49408472204554127, "ewc_loss": 0.060302734375, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.76837158203125e-05, "grad_norm": 26.67570686340332, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8616458773612976, "num_tokens": 148260916.0, "step": 3884 }, { "epoch": 0.4942119323241318, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.71482276916504, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8591945171356201, "num_tokens": 148295591.0, "step": 3885 }, { "epoch": 0.4943391426027223, "ewc_loss": 0.060546875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 26.820846557617188, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8453061580657959, "num_tokens": 148333445.0, "step": 3886 }, { "epoch": 0.4944663528813128, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.534101486206055, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8618457913398743, "num_tokens": 148367951.0, "step": 3887 }, { "epoch": 0.4945935631599033, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 26.970611572265625, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8442916870117188, "num_tokens": 148408491.0, "step": 3888 }, { "epoch": 0.49472077343849385, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.34055519104004, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8525810241699219, "num_tokens": 148443883.0, "step": 3889 }, { "epoch": 0.4948479837170843, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.04159927368164, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8539701104164124, "num_tokens": 148480511.0, "step": 3890 }, { "epoch": 0.49497519399567486, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.2081356048584, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8454166650772095, "num_tokens": 148514759.0, "step": 3891 }, { "epoch": 0.4951024042742654, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.526647567749023, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8537777066230774, "num_tokens": 148553209.0, "step": 3892 }, { "epoch": 0.49522961455285586, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.09939193725586, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8432286381721497, "num_tokens": 148590387.0, "step": 3893 }, { "epoch": 0.4953568248314464, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.806415557861328, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.849849283695221, "num_tokens": 148625201.0, "step": 3894 }, { "epoch": 0.4954840351100369, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.19539451599121, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.856268048286438, "num_tokens": 148665989.0, "step": 3895 }, { "epoch": 0.4956112453886274, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.41385269165039, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.855097770690918, "num_tokens": 148704839.0, "step": 3896 }, { "epoch": 0.4957384556672179, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.09263801574707, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8627287149429321, "num_tokens": 148740523.0, "step": 3897 }, { "epoch": 0.49586566594580844, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.453706741333008, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.868148684501648, "num_tokens": 148779103.0, "step": 3898 }, { "epoch": 0.4959928762243989, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.26112937927246, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8630697727203369, "num_tokens": 148814659.0, "step": 3899 }, { "epoch": 0.49612008650298944, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.225324630737305, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.872604250907898, "num_tokens": 148856864.0, "step": 3900 }, { "epoch": 0.49624729678157997, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.04403305053711, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8587052226066589, "num_tokens": 148890309.0, "step": 3901 }, { "epoch": 0.49637450706017044, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.08094596862793, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8368290662765503, "num_tokens": 148926662.0, "step": 3902 }, { "epoch": 0.496501717338761, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.517749786376953, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8399304151535034, "num_tokens": 148963032.0, "step": 3903 }, { "epoch": 0.4966289276173515, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 26.987340927124023, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8580000400543213, "num_tokens": 148996684.0, "step": 3904 }, { "epoch": 0.496756137895942, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.02790069580078, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8530101776123047, "num_tokens": 149032735.0, "step": 3905 }, { "epoch": 0.4968833481745325, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.356966018676758, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.853371262550354, "num_tokens": 149069415.0, "step": 3906 }, { "epoch": 0.49701055845312303, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.227153778076172, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8383745551109314, "num_tokens": 149115042.0, "step": 3907 }, { "epoch": 0.4971377687317135, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 26.885046005249023, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8607711791992188, "num_tokens": 149149767.0, "step": 3908 }, { "epoch": 0.49726497901030403, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.469968795776367, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8466171026229858, "num_tokens": 149193000.0, "step": 3909 }, { "epoch": 0.49739218928889456, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.072246551513672, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8501464128494263, "num_tokens": 149226963.0, "step": 3910 }, { "epoch": 0.49751939956748503, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.06089210510254, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8475291132926941, "num_tokens": 149268139.0, "step": 3911 }, { "epoch": 0.49764660984607556, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.13753890991211, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.860012948513031, "num_tokens": 149305882.0, "step": 3912 }, { "epoch": 0.4977738201246661, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.079364776611328, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8494157791137695, "num_tokens": 149344119.0, "step": 3913 }, { "epoch": 0.49790103040325656, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.24555206298828, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8435919284820557, "num_tokens": 149384102.0, "step": 3914 }, { "epoch": 0.4980282406818471, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.13185691833496, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8500529527664185, "num_tokens": 149425677.0, "step": 3915 }, { "epoch": 0.4981554509604376, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.067890167236328, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.848354697227478, "num_tokens": 149463275.0, "step": 3916 }, { "epoch": 0.4982826612390281, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.206111907958984, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8664765357971191, "num_tokens": 149503950.0, "step": 3917 }, { "epoch": 0.4984098715176186, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.21934700012207, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.844360888004303, "num_tokens": 149541192.0, "step": 3918 }, { "epoch": 0.49853708179620915, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.12119483947754, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8492860794067383, "num_tokens": 149581784.0, "step": 3919 }, { "epoch": 0.4986642920747996, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.44852066040039, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8568261861801147, "num_tokens": 149619312.0, "step": 3920 }, { "epoch": 0.49879150235339015, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.00419044494629, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.849498987197876, "num_tokens": 149657443.0, "step": 3921 }, { "epoch": 0.4989187126319807, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.547273635864258, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8636908531188965, "num_tokens": 149694992.0, "step": 3922 }, { "epoch": 0.49904592291057115, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 26.834623336791992, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8475072979927063, "num_tokens": 149729554.0, "step": 3923 }, { "epoch": 0.4991731331891617, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.52620506286621, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8590315580368042, "num_tokens": 149770051.0, "step": 3924 }, { "epoch": 0.4993003434677522, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.161069869995117, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8212366104125977, "num_tokens": 149806724.0, "step": 3925 }, { "epoch": 0.4994275537463427, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.331226348876953, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8439476490020752, "num_tokens": 149849874.0, "step": 3926 }, { "epoch": 0.4995547640249332, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.219057083129883, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8518319129943848, "num_tokens": 149888076.0, "step": 3927 }, { "epoch": 0.49968197430352373, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.442127227783203, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8528954982757568, "num_tokens": 149923750.0, "step": 3928 }, { "epoch": 0.4998091845821142, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.18758773803711, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8402751684188843, "num_tokens": 149960657.0, "step": 3929 }, { "epoch": 0.49993639486070474, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.326934814453125, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.867417573928833, "num_tokens": 149995983.0, "step": 3930 }, { "epoch": 0.5000636051392953, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.252273559570312, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8307497501373291, "num_tokens": 150037508.0, "step": 3931 }, { "epoch": 0.5001908154178858, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.334278106689453, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8472111225128174, "num_tokens": 150077806.0, "step": 3932 }, { "epoch": 0.5003180256964763, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.255300521850586, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8507277965545654, "num_tokens": 150117589.0, "step": 3933 }, { "epoch": 0.5004452359750667, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.13931655883789, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8471841216087341, "num_tokens": 150161579.0, "step": 3934 }, { "epoch": 0.5005724462536573, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.452497482299805, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8387659788131714, "num_tokens": 150208018.0, "step": 3935 }, { "epoch": 0.5006996565322478, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.29353904724121, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.854390025138855, "num_tokens": 150250164.0, "step": 3936 }, { "epoch": 0.5008268668108383, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.410371780395508, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8703441619873047, "num_tokens": 150289972.0, "step": 3937 }, { "epoch": 0.5009540770894289, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.21607780456543, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8506691455841064, "num_tokens": 150333656.0, "step": 3938 }, { "epoch": 0.5010812873680194, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.393844604492188, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8658581376075745, "num_tokens": 150374086.0, "step": 3939 }, { "epoch": 0.5012084976466098, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.31245994567871, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8560497164726257, "num_tokens": 150407472.0, "step": 3940 }, { "epoch": 0.5013357079252003, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.511125564575195, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8570512533187866, "num_tokens": 150444560.0, "step": 3941 }, { "epoch": 0.5014629182037909, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.20186424255371, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8572216033935547, "num_tokens": 150483621.0, "step": 3942 }, { "epoch": 0.5015901284823814, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.3953914642334, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8519266843795776, "num_tokens": 150520414.0, "step": 3943 }, { "epoch": 0.5017173387609719, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.368024826049805, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.84537672996521, "num_tokens": 150556589.0, "step": 3944 }, { "epoch": 0.5018445490395624, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.606098175048828, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8505415916442871, "num_tokens": 150592393.0, "step": 3945 }, { "epoch": 0.5019717593181529, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.42660140991211, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8390161991119385, "num_tokens": 150630457.0, "step": 3946 }, { "epoch": 0.5020989695967434, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.111919403076172, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8609458804130554, "num_tokens": 150659482.0, "step": 3947 }, { "epoch": 0.5022261798753339, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.769012451171875, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.85285484790802, "num_tokens": 150700920.0, "step": 3948 }, { "epoch": 0.5023533901539244, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.070985794067383, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8626325726509094, "num_tokens": 150736393.0, "step": 3949 }, { "epoch": 0.502480600432515, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.770748138427734, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8490283489227295, "num_tokens": 150783848.0, "step": 3950 }, { "epoch": 0.5026078107111055, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.24452781677246, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8492498397827148, "num_tokens": 150824179.0, "step": 3951 }, { "epoch": 0.5027350209896959, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.35987663269043, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8520990610122681, "num_tokens": 150862775.0, "step": 3952 }, { "epoch": 0.5028622312682864, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.631832122802734, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8512775897979736, "num_tokens": 150900005.0, "step": 3953 }, { "epoch": 0.502989441546877, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.606718063354492, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8487828969955444, "num_tokens": 150936148.0, "step": 3954 }, { "epoch": 0.5031166518254675, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.385086059570312, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8490046262741089, "num_tokens": 150971661.0, "step": 3955 }, { "epoch": 0.503243862104058, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.780399322509766, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8640372157096863, "num_tokens": 151009521.0, "step": 3956 }, { "epoch": 0.5033710723826486, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.399595260620117, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8484725952148438, "num_tokens": 151053641.0, "step": 3957 }, { "epoch": 0.5034982826612391, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.670684814453125, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8571789860725403, "num_tokens": 151094476.0, "step": 3958 }, { "epoch": 0.5036254929398295, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.745986938476562, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8462904095649719, "num_tokens": 151126381.0, "step": 3959 }, { "epoch": 0.50375270321842, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.315650939941406, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8580018281936646, "num_tokens": 151156875.0, "step": 3960 }, { "epoch": 0.5038799134970106, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.616615295410156, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8484562039375305, "num_tokens": 151194394.0, "step": 3961 }, { "epoch": 0.5040071237756011, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.48339080810547, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8379150629043579, "num_tokens": 151234472.0, "step": 3962 }, { "epoch": 0.5041343340541916, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.32176971435547, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8479586839675903, "num_tokens": 151270426.0, "step": 3963 }, { "epoch": 0.5042615443327821, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.593624114990234, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8504951000213623, "num_tokens": 151305793.0, "step": 3964 }, { "epoch": 0.5043887546113726, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.542280197143555, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8390771150588989, "num_tokens": 151345116.0, "step": 3965 }, { "epoch": 0.5045159648899631, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.736221313476562, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8565718531608582, "num_tokens": 151381901.0, "step": 3966 }, { "epoch": 0.5046431751685536, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.67486572265625, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8547144532203674, "num_tokens": 151426884.0, "step": 3967 }, { "epoch": 0.5047703854471441, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.411348342895508, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8453675508499146, "num_tokens": 151467596.0, "step": 3968 }, { "epoch": 0.5048975957257347, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.869775772094727, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.871626615524292, "num_tokens": 151502131.0, "step": 3969 }, { "epoch": 0.5050248060043252, "ewc_loss": 0.060791015625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.792213439941406e-05, "grad_norm": 27.118070602416992, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8488022089004517, "num_tokens": 151538932.0, "step": 3970 }, { "epoch": 0.5051520162829156, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.646223068237305, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8503921031951904, "num_tokens": 151577839.0, "step": 3971 }, { "epoch": 0.5052792265615061, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.81525993347168, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8513014316558838, "num_tokens": 151618984.0, "step": 3972 }, { "epoch": 0.5054064368400967, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.128690719604492, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8528077006340027, "num_tokens": 151658392.0, "step": 3973 }, { "epoch": 0.5055336471186872, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.547313690185547, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8612781763076782, "num_tokens": 151689781.0, "step": 3974 }, { "epoch": 0.5056608573972777, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.368919372558594, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8371666669845581, "num_tokens": 151729612.0, "step": 3975 }, { "epoch": 0.5057880676758683, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.357431411743164, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8394652605056763, "num_tokens": 151769270.0, "step": 3976 }, { "epoch": 0.5059152779544587, "ewc_loss": 0.0615234375, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.863739013671875e-05, "grad_norm": 27.717453002929688, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8329806923866272, "num_tokens": 151803936.0, "step": 3977 }, { "epoch": 0.5060424882330492, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.594112396240234, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.840228796005249, "num_tokens": 151840429.0, "step": 3978 }, { "epoch": 0.5061696985116397, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.515954971313477, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8532365560531616, "num_tokens": 151883296.0, "step": 3979 }, { "epoch": 0.5062969087902303, "ewc_loss": 0.06103515625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.8160552978515625e-05, "grad_norm": 27.82916259765625, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.866597056388855, "num_tokens": 151917658.0, "step": 3980 }, { "epoch": 0.5064241190688208, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.142215728759766, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8511213660240173, "num_tokens": 151958822.0, "step": 3981 }, { "epoch": 0.5065513293474113, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.582021713256836, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8469511270523071, "num_tokens": 151994964.0, "step": 3982 }, { "epoch": 0.5066785396260017, "ewc_loss": 0.061279296875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.839897155761719e-05, "grad_norm": 27.09135627746582, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8477165699005127, "num_tokens": 152037013.0, "step": 3983 }, { "epoch": 0.5068057499045923, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.479026794433594, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8476415276527405, "num_tokens": 152074821.0, "step": 3984 }, { "epoch": 0.5069329601831828, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.33470344543457, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8454774022102356, "num_tokens": 152116667.0, "step": 3985 }, { "epoch": 0.5070601704617733, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.032379150390625, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8555589318275452, "num_tokens": 152152714.0, "step": 3986 }, { "epoch": 0.5071873807403638, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.732145309448242, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8380244970321655, "num_tokens": 152190230.0, "step": 3987 }, { "epoch": 0.5073145910189544, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.31412696838379, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8503305912017822, "num_tokens": 152228887.0, "step": 3988 }, { "epoch": 0.5074418012975448, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.8173885345459, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8612086176872253, "num_tokens": 152271863.0, "step": 3989 }, { "epoch": 0.5075690115761353, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.12859344482422, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.845980703830719, "num_tokens": 152310765.0, "step": 3990 }, { "epoch": 0.5076962218547258, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.726839065551758, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8735576868057251, "num_tokens": 152342545.0, "step": 3991 }, { "epoch": 0.5078234321333164, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.381025314331055, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8358148336410522, "num_tokens": 152386223.0, "step": 3992 }, { "epoch": 0.5079506424119069, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.637794494628906, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.858991265296936, "num_tokens": 152421811.0, "step": 3993 }, { "epoch": 0.5080778526904974, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.73357582092285, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8379016518592834, "num_tokens": 152454581.0, "step": 3994 }, { "epoch": 0.5082050629690879, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.689273834228516, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8412914276123047, "num_tokens": 152490591.0, "step": 3995 }, { "epoch": 0.5083322732476784, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.629037857055664, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8737883567810059, "num_tokens": 152532290.0, "step": 3996 }, { "epoch": 0.5084594835262689, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.676624298095703, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8575988411903381, "num_tokens": 152571984.0, "step": 3997 }, { "epoch": 0.5085866938048594, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.554967880249023, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8519355058670044, "num_tokens": 152611082.0, "step": 3998 }, { "epoch": 0.50871390408345, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.55928611755371, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8598315119743347, "num_tokens": 152655303.0, "step": 3999 }, { "epoch": 0.5088411143620405, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.672849655151367, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8509582281112671, "num_tokens": 152688432.0, "step": 4000 }, { "epoch": 0.5089683246406309, "ewc_loss": 0.061767578125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.887580871582031e-05, "grad_norm": 27.706830978393555, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8461905121803284, "num_tokens": 152725637.0, "step": 4001 }, { "epoch": 0.5090955349192214, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.7886905670166, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8500347137451172, "num_tokens": 152763108.0, "step": 4002 }, { "epoch": 0.509222745197812, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.614299774169922, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8587602972984314, "num_tokens": 152800659.0, "step": 4003 }, { "epoch": 0.5093499554764025, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.61368751525879, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8429845571517944, "num_tokens": 152842797.0, "step": 4004 }, { "epoch": 0.509477165754993, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.551538467407227, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.84254390001297, "num_tokens": 152888257.0, "step": 4005 }, { "epoch": 0.5096043760335836, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.914701461791992, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.851954460144043, "num_tokens": 152929089.0, "step": 4006 }, { "epoch": 0.5097315863121741, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.433462142944336, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8461658358573914, "num_tokens": 152972710.0, "step": 4007 }, { "epoch": 0.5098587965907645, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.852819442749023, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8526641130447388, "num_tokens": 153008867.0, "step": 4008 }, { "epoch": 0.509986006869355, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.57588005065918, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8463097810745239, "num_tokens": 153051497.0, "step": 4009 }, { "epoch": 0.5101132171479456, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.80759620666504, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8686042428016663, "num_tokens": 153089295.0, "step": 4010 }, { "epoch": 0.5102404274265361, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.588520050048828, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8558607697486877, "num_tokens": 153123957.0, "step": 4011 }, { "epoch": 0.5103676377051266, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.866924285888672, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8485708236694336, "num_tokens": 153165248.0, "step": 4012 }, { "epoch": 0.5104948479837171, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.860353469848633, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8662936091423035, "num_tokens": 153206163.0, "step": 4013 }, { "epoch": 0.5106220582623076, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.883163452148438, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8424818515777588, "num_tokens": 153248666.0, "step": 4014 }, { "epoch": 0.5107492685408981, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.734235763549805, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8747168779373169, "num_tokens": 153289009.0, "step": 4015 }, { "epoch": 0.5108764788194886, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.783082962036133, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8566936254501343, "num_tokens": 153321354.0, "step": 4016 }, { "epoch": 0.5110036890980791, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.486299514770508, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8512107729911804, "num_tokens": 153361061.0, "step": 4017 }, { "epoch": 0.5111308993766697, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.532102584838867, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8456587791442871, "num_tokens": 153397002.0, "step": 4018 }, { "epoch": 0.5112581096552602, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.678327560424805, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8379037380218506, "num_tokens": 153435264.0, "step": 4019 }, { "epoch": 0.5113853199338506, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.375825881958008, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8506031036376953, "num_tokens": 153474751.0, "step": 4020 }, { "epoch": 0.5115125302124411, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.613147735595703, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8602124452590942, "num_tokens": 153510084.0, "step": 4021 }, { "epoch": 0.5116397404910317, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.17510223388672, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8372131586074829, "num_tokens": 153550387.0, "step": 4022 }, { "epoch": 0.5117669507696222, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.37566375732422, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8734333515167236, "num_tokens": 153592441.0, "step": 4023 }, { "epoch": 0.5118941610482127, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.82875633239746, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8593227863311768, "num_tokens": 153629202.0, "step": 4024 }, { "epoch": 0.5120213713268033, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.43313980102539, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8464828729629517, "num_tokens": 153669243.0, "step": 4025 }, { "epoch": 0.5121485816053937, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.776092529296875, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8467149138450623, "num_tokens": 153709206.0, "step": 4026 }, { "epoch": 0.5122757918839842, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.69647216796875, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8584442138671875, "num_tokens": 153741251.0, "step": 4027 }, { "epoch": 0.5124030021625747, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.76542091369629, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8370336890220642, "num_tokens": 153775559.0, "step": 4028 }, { "epoch": 0.5125302124411653, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.410350799560547, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8561044931411743, "num_tokens": 153819158.0, "step": 4029 }, { "epoch": 0.5126574227197558, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.71965217590332, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8443874716758728, "num_tokens": 153856231.0, "step": 4030 }, { "epoch": 0.5127846329983463, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.164443969726562, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8624098896980286, "num_tokens": 153897125.0, "step": 4031 }, { "epoch": 0.5129118432769367, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 28.036176681518555, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.857251763343811, "num_tokens": 153933552.0, "step": 4032 }, { "epoch": 0.5130390535555273, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.130765914916992, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8368414044380188, "num_tokens": 153974337.0, "step": 4033 }, { "epoch": 0.5131662638341178, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.790510177612305, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.845584511756897, "num_tokens": 154018911.0, "step": 4034 }, { "epoch": 0.5132934741127083, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.2082462310791, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.860863983631134, "num_tokens": 154056441.0, "step": 4035 }, { "epoch": 0.5134206843912988, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.860191345214844, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8416885137557983, "num_tokens": 154096326.0, "step": 4036 }, { "epoch": 0.5135478946698894, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.328357696533203, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8565042614936829, "num_tokens": 154137223.0, "step": 4037 }, { "epoch": 0.5136751049484798, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.488679885864258, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8503332138061523, "num_tokens": 154168312.0, "step": 4038 }, { "epoch": 0.5138023152270703, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.46554946899414, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8654367923736572, "num_tokens": 154203125.0, "step": 4039 }, { "epoch": 0.5139295255056608, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.38395118713379, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8466835021972656, "num_tokens": 154248437.0, "step": 4040 }, { "epoch": 0.5140567357842514, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.419824600219727, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8554064035415649, "num_tokens": 154286303.0, "step": 4041 }, { "epoch": 0.5141839460628419, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.22286033630371, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8411434292793274, "num_tokens": 154328638.0, "step": 4042 }, { "epoch": 0.5143111563414324, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.465335845947266, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8501624464988708, "num_tokens": 154362953.0, "step": 4043 }, { "epoch": 0.5144383666200228, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.398229598999023, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8396399021148682, "num_tokens": 154401289.0, "step": 4044 }, { "epoch": 0.5145655768986134, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.386674880981445, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8556923866271973, "num_tokens": 154438779.0, "step": 4045 }, { "epoch": 0.5146927871772039, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.560707092285156, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8463671207427979, "num_tokens": 154467032.0, "step": 4046 }, { "epoch": 0.5148199974557944, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.636215209960938, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8608745336532593, "num_tokens": 154505222.0, "step": 4047 }, { "epoch": 0.514947207734385, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.405872344970703, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.837779700756073, "num_tokens": 154544953.0, "step": 4048 }, { "epoch": 0.5150744180129755, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.471771240234375, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8636407852172852, "num_tokens": 154586193.0, "step": 4049 }, { "epoch": 0.5152016282915659, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.740190505981445, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8566980361938477, "num_tokens": 154629531.0, "step": 4050 }, { "epoch": 0.5153288385701564, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.2827091217041, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8675846457481384, "num_tokens": 154659482.0, "step": 4051 }, { "epoch": 0.515456048848747, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.362720489501953, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8191100358963013, "num_tokens": 154702645.0, "step": 4052 }, { "epoch": 0.5155832591273375, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.315208435058594, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8603049516677856, "num_tokens": 154743329.0, "step": 4053 }, { "epoch": 0.515710469405928, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.635112762451172, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8479718565940857, "num_tokens": 154783460.0, "step": 4054 }, { "epoch": 0.5158376796845185, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.161996841430664, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8460306525230408, "num_tokens": 154827209.0, "step": 4055 }, { "epoch": 0.5159648899631091, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.72382926940918, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8273601531982422, "num_tokens": 154866577.0, "step": 4056 }, { "epoch": 0.5160921002416995, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.195043563842773, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8541076183319092, "num_tokens": 154912191.0, "step": 4057 }, { "epoch": 0.51621931052029, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.60733985900879, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8386656045913696, "num_tokens": 154953412.0, "step": 4058 }, { "epoch": 0.5163465207988805, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.76607322692871, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8629685044288635, "num_tokens": 154989913.0, "step": 4059 }, { "epoch": 0.5164737310774711, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.448347091674805, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8398487567901611, "num_tokens": 155038094.0, "step": 4060 }, { "epoch": 0.5166009413560616, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.62119483947754, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.842345118522644, "num_tokens": 155075900.0, "step": 4061 }, { "epoch": 0.5167281516346521, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.57994270324707, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8501412272453308, "num_tokens": 155117038.0, "step": 4062 }, { "epoch": 0.5168553619132426, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.41143226623535, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8642123937606812, "num_tokens": 155151442.0, "step": 4063 }, { "epoch": 0.5169825721918331, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.47003746032715, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8603249788284302, "num_tokens": 155192288.0, "step": 4064 }, { "epoch": 0.5171097824704236, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.50564956665039, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8550589084625244, "num_tokens": 155238931.0, "step": 4065 }, { "epoch": 0.5172369927490141, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.368366241455078, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8652612566947937, "num_tokens": 155276644.0, "step": 4066 }, { "epoch": 0.5173642030276047, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.61459732055664, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8640583753585815, "num_tokens": 155307638.0, "step": 4067 }, { "epoch": 0.5174914133061952, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.396671295166016, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.844023585319519, "num_tokens": 155343255.0, "step": 4068 }, { "epoch": 0.5176186235847856, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.524526596069336, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8534750938415527, "num_tokens": 155382283.0, "step": 4069 }, { "epoch": 0.5177458338633761, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.5052433013916, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8293660879135132, "num_tokens": 155427096.0, "step": 4070 }, { "epoch": 0.5178730441419667, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.4282283782959, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8398575186729431, "num_tokens": 155467472.0, "step": 4071 }, { "epoch": 0.5180002544205572, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.635162353515625, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8471434712409973, "num_tokens": 155508350.0, "step": 4072 }, { "epoch": 0.5181274646991477, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.872068405151367, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8744980692863464, "num_tokens": 155547215.0, "step": 4073 }, { "epoch": 0.5182546749777382, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.488704681396484, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8562405109405518, "num_tokens": 155586658.0, "step": 4074 }, { "epoch": 0.5183818852563287, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.846763610839844, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8537915945053101, "num_tokens": 155620944.0, "step": 4075 }, { "epoch": 0.5185090955349192, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.54203224182129, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8496373891830444, "num_tokens": 155662699.0, "step": 4076 }, { "epoch": 0.5186363058135097, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.71597671508789, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8399220705032349, "num_tokens": 155707783.0, "step": 4077 }, { "epoch": 0.5187635160921003, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.784700393676758, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8556323051452637, "num_tokens": 155745227.0, "step": 4078 }, { "epoch": 0.5188907263706908, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.758134841918945, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8520492911338806, "num_tokens": 155786466.0, "step": 4079 }, { "epoch": 0.5190179366492813, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.648202896118164, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8532361388206482, "num_tokens": 155833145.0, "step": 4080 }, { "epoch": 0.5191451469278717, "ewc_loss": 0.062255859375, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.75912094116211, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8431327939033508, "num_tokens": 155869533.0, "step": 4081 }, { "epoch": 0.5192723572064623, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.694541931152344, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8505958318710327, "num_tokens": 155909329.0, "step": 4082 }, { "epoch": 0.5193995674850528, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.739103317260742, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8612356781959534, "num_tokens": 155947848.0, "step": 4083 }, { "epoch": 0.5195267777636433, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.726865768432617, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8551195859909058, "num_tokens": 155982693.0, "step": 4084 }, { "epoch": 0.5196539880422338, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.80547523498535, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8511836528778076, "num_tokens": 156023414.0, "step": 4085 }, { "epoch": 0.5197811983208244, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.893484115600586, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8506213426589966, "num_tokens": 156063488.0, "step": 4086 }, { "epoch": 0.5199084085994148, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.959152221679688, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8515232801437378, "num_tokens": 156103081.0, "step": 4087 }, { "epoch": 0.5200356188780053, "ewc_loss": 0.0625, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.823772430419922, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8429362773895264, "num_tokens": 156139453.0, "step": 4088 }, { "epoch": 0.5201628291565958, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.917966842651367, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.847173810005188, "num_tokens": 156176564.0, "step": 4089 }, { "epoch": 0.5202900394351864, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.4735107421875, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8552175760269165, "num_tokens": 156221278.0, "step": 4090 }, { "epoch": 0.5204172497137769, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 29.018518447875977, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8576633930206299, "num_tokens": 156259124.0, "step": 4091 }, { "epoch": 0.5205444599923674, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.359054565429688, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8419555425643921, "num_tokens": 156297832.0, "step": 4092 }, { "epoch": 0.5206716702709578, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 28.393381118774414, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.860190749168396, "num_tokens": 156329745.0, "step": 4093 }, { "epoch": 0.5207988805495484, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.62470245361328, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.851722240447998, "num_tokens": 156360154.0, "step": 4094 }, { "epoch": 0.5209260908281389, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.987178802490234, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8546343445777893, "num_tokens": 156399360.0, "step": 4095 }, { "epoch": 0.5210533011067294, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.76321792602539, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.851218581199646, "num_tokens": 156438782.0, "step": 4096 }, { "epoch": 0.52118051138532, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.9421329498291, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8562543392181396, "num_tokens": 156470261.0, "step": 4097 }, { "epoch": 0.5213077216639105, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.916513442993164, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8338143825531006, "num_tokens": 156510493.0, "step": 4098 }, { "epoch": 0.5214349319425009, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.964786529541016, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8533203601837158, "num_tokens": 156545040.0, "step": 4099 }, { "epoch": 0.5215621422210914, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.60111427307129, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8500894904136658, "num_tokens": 156583474.0, "step": 4100 }, { "epoch": 0.521689352499682, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.637348175048828, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8520845174789429, "num_tokens": 156627137.0, "step": 4101 }, { "epoch": 0.5218165627782725, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.986141204833984, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8409178256988525, "num_tokens": 156664726.0, "step": 4102 }, { "epoch": 0.521943773056863, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.569292068481445, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8693174123764038, "num_tokens": 156707014.0, "step": 4103 }, { "epoch": 0.5220709833354535, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.800323486328125, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8482023477554321, "num_tokens": 156747784.0, "step": 4104 }, { "epoch": 0.522198193614044, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.570358276367188, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8504818677902222, "num_tokens": 156784760.0, "step": 4105 }, { "epoch": 0.5223254038926345, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.7868709564209, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8497278690338135, "num_tokens": 156822899.0, "step": 4106 }, { "epoch": 0.522452614171225, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.846052169799805, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8580425977706909, "num_tokens": 156861308.0, "step": 4107 }, { "epoch": 0.5225798244498155, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.6874942779541, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.859059751033783, "num_tokens": 156901981.0, "step": 4108 }, { "epoch": 0.5227070347284061, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.859643936157227, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8566395044326782, "num_tokens": 156942215.0, "step": 4109 }, { "epoch": 0.5228342450069966, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.76812744140625, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8586297631263733, "num_tokens": 156983820.0, "step": 4110 }, { "epoch": 0.5229614552855871, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.64649772644043, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8644051551818848, "num_tokens": 157016215.0, "step": 4111 }, { "epoch": 0.5230886655641775, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.92359161376953, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8511919975280762, "num_tokens": 157052071.0, "step": 4112 }, { "epoch": 0.5232158758427681, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.73106575012207, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8299205899238586, "num_tokens": 157097711.0, "step": 4113 }, { "epoch": 0.5233430861213586, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 27.653873443603516, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8512371778488159, "num_tokens": 157134801.0, "step": 4114 }, { "epoch": 0.5234702963999491, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.982948303222656e-05, "grad_norm": 28.040754318237305, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8530049920082092, "num_tokens": 157171885.0, "step": 4115 }, { "epoch": 0.5235975066785397, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.613574981689453, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8497684597969055, "num_tokens": 157211915.0, "step": 4116 }, { "epoch": 0.5237247169571302, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.722061157226562, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8457652926445007, "num_tokens": 157260564.0, "step": 4117 }, { "epoch": 0.5238519272357206, "ewc_loss": 0.06201171875, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 4.9114227294921875e-05, "grad_norm": 27.796070098876953, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8531127572059631, "num_tokens": 157294676.0, "step": 4118 }, { "epoch": 0.5239791375143111, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.57134246826172, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8625679612159729, "num_tokens": 157334607.0, "step": 4119 }, { "epoch": 0.5241063477929017, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.894378662109375, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.852716326713562, "num_tokens": 157372021.0, "step": 4120 }, { "epoch": 0.5242335580714922, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.556615829467773, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8575524091720581, "num_tokens": 157410092.0, "step": 4121 }, { "epoch": 0.5243607683500827, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 28.245441436767578, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8493392467498779, "num_tokens": 157447979.0, "step": 4122 }, { "epoch": 0.5244879786286732, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.5748233795166, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8468213677406311, "num_tokens": 157481376.0, "step": 4123 }, { "epoch": 0.5246151889072637, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 28.14427947998047, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8578091859817505, "num_tokens": 157519555.0, "step": 4124 }, { "epoch": 0.5247423991858542, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.935264587402344e-05, "grad_norm": 27.321704864501953, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8494135737419128, "num_tokens": 157551849.0, "step": 4125 }, { "epoch": 0.5248696094644447, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.185697555541992, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8519842028617859, "num_tokens": 157590823.0, "step": 4126 }, { "epoch": 0.5249968197430352, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.545475006103516, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8625739812850952, "num_tokens": 157633771.0, "step": 4127 }, { "epoch": 0.5251240300216258, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.991151809692383, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.86663818359375, "num_tokens": 157667796.0, "step": 4128 }, { "epoch": 0.5252512403002163, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.52873420715332, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8624675273895264, "num_tokens": 157709452.0, "step": 4129 }, { "epoch": 0.5253784505788067, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 28.260393142700195, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8430211544036865, "num_tokens": 157741407.0, "step": 4130 }, { "epoch": 0.5255056608573972, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.572162628173828, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8634946346282959, "num_tokens": 157777693.0, "step": 4131 }, { "epoch": 0.5256328711359878, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 28.027849197387695, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8503018617630005, "num_tokens": 157814729.0, "step": 4132 }, { "epoch": 0.5257600814145783, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.74565315246582, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8627687096595764, "num_tokens": 157853698.0, "step": 4133 }, { "epoch": 0.5258872916931688, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.8497314453125, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8574790954589844, "num_tokens": 157888580.0, "step": 4134 }, { "epoch": 0.5260145019717594, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.767192840576172, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8507117033004761, "num_tokens": 157928434.0, "step": 4135 }, { "epoch": 0.5261417122503498, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.015804290771484, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8481405973434448, "num_tokens": 157968936.0, "step": 4136 }, { "epoch": 0.5262689225289403, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.984209060668945, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8414825201034546, "num_tokens": 158005496.0, "step": 4137 }, { "epoch": 0.5263961328075308, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.93541717529297, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8600109815597534, "num_tokens": 158047569.0, "step": 4138 }, { "epoch": 0.5265233430861214, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.263240814208984, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8521801233291626, "num_tokens": 158085733.0, "step": 4139 }, { "epoch": 0.5266505533647119, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.769411087036133, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8418557643890381, "num_tokens": 158123906.0, "step": 4140 }, { "epoch": 0.5267777636433024, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.306671142578125, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.858603835105896, "num_tokens": 158163663.0, "step": 4141 }, { "epoch": 0.5269049739218928, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.7721004486084, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8406798839569092, "num_tokens": 158205878.0, "step": 4142 }, { "epoch": 0.5270321842004834, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.40256118774414, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8559473156929016, "num_tokens": 158241608.0, "step": 4143 }, { "epoch": 0.5271593944790739, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.617324829101562, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.86556077003479, "num_tokens": 158274259.0, "step": 4144 }, { "epoch": 0.5272866047576644, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 28.17813491821289, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.858449399471283, "num_tokens": 158304793.0, "step": 4145 }, { "epoch": 0.527413815036255, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.52279281616211, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.866345226764679, "num_tokens": 158349420.0, "step": 4146 }, { "epoch": 0.5275410253148455, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.358287811279297, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8508509993553162, "num_tokens": 158395834.0, "step": 4147 }, { "epoch": 0.5276682355934359, "ewc_loss": 0.0625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 4.9591064453125e-05, "grad_norm": 27.51120376586914, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8548845052719116, "num_tokens": 158435568.0, "step": 4148 }, { "epoch": 0.5277954458720264, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 28.077091217041016, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8620060086250305, "num_tokens": 158473584.0, "step": 4149 }, { "epoch": 0.527922656150617, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.501567840576172, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8534696102142334, "num_tokens": 158513435.0, "step": 4150 }, { "epoch": 0.5280498664292075, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.21930503845215, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8540685772895813, "num_tokens": 158549478.0, "step": 4151 }, { "epoch": 0.528177076707798, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.647157669067383, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8573538661003113, "num_tokens": 158598742.0, "step": 4152 }, { "epoch": 0.5283042869863885, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.0430908203125, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8416112661361694, "num_tokens": 158636806.0, "step": 4153 }, { "epoch": 0.528431497264979, "ewc_loss": 0.06298828125, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.0067901611328125e-05, "grad_norm": 27.727182388305664, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8460991978645325, "num_tokens": 158676043.0, "step": 4154 }, { "epoch": 0.5285587075435695, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.083969116210938, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8377283811569214, "num_tokens": 158711772.0, "step": 4155 }, { "epoch": 0.52868591782216, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.607568740844727, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8510376214981079, "num_tokens": 158749608.0, "step": 4156 }, { "epoch": 0.5288131281007505, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.069299697875977, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8533079028129578, "num_tokens": 158790502.0, "step": 4157 }, { "epoch": 0.5289403383793411, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.73550033569336, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8284181356430054, "num_tokens": 158828415.0, "step": 4158 }, { "epoch": 0.5290675486579316, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.03569984436035, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8432778120040894, "num_tokens": 158866972.0, "step": 4159 }, { "epoch": 0.5291947589365221, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.64008140563965, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8502074480056763, "num_tokens": 158902991.0, "step": 4160 }, { "epoch": 0.5293219692151125, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.070253372192383, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8603998422622681, "num_tokens": 158945963.0, "step": 4161 }, { "epoch": 0.5294491794937031, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.626867294311523, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8512546420097351, "num_tokens": 158985407.0, "step": 4162 }, { "epoch": 0.5295763897722936, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.183990478515625, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8458075523376465, "num_tokens": 159024387.0, "step": 4163 }, { "epoch": 0.5297036000508841, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.74542999267578, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8448007106781006, "num_tokens": 159061430.0, "step": 4164 }, { "epoch": 0.5298308103294747, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.735435485839844, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.853865921497345, "num_tokens": 159099350.0, "step": 4165 }, { "epoch": 0.5299580206080652, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.273609161376953, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8281110525131226, "num_tokens": 159136276.0, "step": 4166 }, { "epoch": 0.5300852308866556, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.589618682861328, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8738412857055664, "num_tokens": 159177102.0, "step": 4167 }, { "epoch": 0.5302124411652461, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.222124099731445, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8647397756576538, "num_tokens": 159213582.0, "step": 4168 }, { "epoch": 0.5303396514438367, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.433446884155273, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8508727550506592, "num_tokens": 159254283.0, "step": 4169 }, { "epoch": 0.5304668617224272, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.425851821899414, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8541020154953003, "num_tokens": 159288557.0, "step": 4170 }, { "epoch": 0.5305940720010177, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.367929458618164, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8623679280281067, "num_tokens": 159325047.0, "step": 4171 }, { "epoch": 0.5307212822796082, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.021162033081055, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8573868274688721, "num_tokens": 159366291.0, "step": 4172 }, { "epoch": 0.5308484925581987, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.23668670654297, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8466217517852783, "num_tokens": 159401634.0, "step": 4173 }, { "epoch": 0.5309757028367892, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.34379005432129, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8446458578109741, "num_tokens": 159435316.0, "step": 4174 }, { "epoch": 0.5311029131153797, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.963546752929688, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8457539081573486, "num_tokens": 159472192.0, "step": 4175 }, { "epoch": 0.5312301233939702, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.78155517578125, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8603119850158691, "num_tokens": 159515342.0, "step": 4176 }, { "epoch": 0.5313573336725608, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.552928924560547, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8565109372138977, "num_tokens": 159553932.0, "step": 4177 }, { "epoch": 0.5314845439511513, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.6988582611084, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8536209464073181, "num_tokens": 159584711.0, "step": 4178 }, { "epoch": 0.5316117542297417, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.70148277282715, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8409298062324524, "num_tokens": 159618784.0, "step": 4179 }, { "epoch": 0.5317389645083322, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.771461486816406, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8513145446777344, "num_tokens": 159661288.0, "step": 4180 }, { "epoch": 0.5318661747869228, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.647342681884766, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8628840446472168, "num_tokens": 159701286.0, "step": 4181 }, { "epoch": 0.5319933850655133, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.766983032226562, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8398974537849426, "num_tokens": 159746631.0, "step": 4182 }, { "epoch": 0.5321205953441038, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.626022338867188, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8551617860794067, "num_tokens": 159787753.0, "step": 4183 }, { "epoch": 0.5322478056226944, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.770742416381836, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8456349968910217, "num_tokens": 159823092.0, "step": 4184 }, { "epoch": 0.5323750159012848, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.953916549682617, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8429684042930603, "num_tokens": 159858480.0, "step": 4185 }, { "epoch": 0.5325022261798753, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.959745407104492, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.843842625617981, "num_tokens": 159896220.0, "step": 4186 }, { "epoch": 0.5326294364584658, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.98074722290039, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8493378162384033, "num_tokens": 159930995.0, "step": 4187 }, { "epoch": 0.5327566467370564, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.291217803955078, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8476557731628418, "num_tokens": 159968013.0, "step": 4188 }, { "epoch": 0.5328838570156469, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.112506866455078, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8461130857467651, "num_tokens": 160004085.0, "step": 4189 }, { "epoch": 0.5330110672942374, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.39806365966797, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8631147742271423, "num_tokens": 160040606.0, "step": 4190 }, { "epoch": 0.5331382775728278, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.096481323242188, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8654034733772278, "num_tokens": 160080792.0, "step": 4191 }, { "epoch": 0.5332654878514184, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.468647003173828, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8422034382820129, "num_tokens": 160117055.0, "step": 4192 }, { "epoch": 0.5333926981300089, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.9207706451416, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8581151962280273, "num_tokens": 160153223.0, "step": 4193 }, { "epoch": 0.5335199084085994, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.63562774658203, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.835452139377594, "num_tokens": 160190920.0, "step": 4194 }, { "epoch": 0.53364711868719, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.928674697875977, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.849899411201477, "num_tokens": 160221898.0, "step": 4195 }, { "epoch": 0.5337743289657805, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.59612464904785, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8557689189910889, "num_tokens": 160267102.0, "step": 4196 }, { "epoch": 0.5339015392443709, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.78489875793457, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8572499752044678, "num_tokens": 160304422.0, "step": 4197 }, { "epoch": 0.5340287495229614, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.567590713500977, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8503557443618774, "num_tokens": 160339006.0, "step": 4198 }, { "epoch": 0.534155959801552, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.794097900390625, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8728619813919067, "num_tokens": 160377504.0, "step": 4199 }, { "epoch": 0.5342831700801425, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.84742546081543, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8305734395980835, "num_tokens": 160413315.0, "step": 4200 }, { "epoch": 0.534410380358733, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.740188598632812, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8662038445472717, "num_tokens": 160455034.0, "step": 4201 }, { "epoch": 0.5345375906373235, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.849470138549805, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.846420407295227, "num_tokens": 160490505.0, "step": 4202 }, { "epoch": 0.534664800915914, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.07101821899414, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8628793358802795, "num_tokens": 160528712.0, "step": 4203 }, { "epoch": 0.5347920111945045, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.501428604125977, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.843853235244751, "num_tokens": 160567945.0, "step": 4204 }, { "epoch": 0.534919221473095, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.766944885253906, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8595433235168457, "num_tokens": 160596767.0, "step": 4205 }, { "epoch": 0.5350464317516855, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.945585250854492, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8493673801422119, "num_tokens": 160636912.0, "step": 4206 }, { "epoch": 0.5351736420302761, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.905017852783203, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8465642929077148, "num_tokens": 160674588.0, "step": 4207 }, { "epoch": 0.5353008523088666, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.884151458740234, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8417059779167175, "num_tokens": 160713441.0, "step": 4208 }, { "epoch": 0.5354280625874571, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.719181060791016, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8625679016113281, "num_tokens": 160752716.0, "step": 4209 }, { "epoch": 0.5355552728660475, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.028833389282227, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8636387586593628, "num_tokens": 160798771.0, "step": 4210 }, { "epoch": 0.5356824831446381, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.864498138427734, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8703135251998901, "num_tokens": 160843085.0, "step": 4211 }, { "epoch": 0.5358096934232286, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.808412551879883, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8447027802467346, "num_tokens": 160883073.0, "step": 4212 }, { "epoch": 0.5359369037018191, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.182329177856445, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8588321805000305, "num_tokens": 160923734.0, "step": 4213 }, { "epoch": 0.5360641139804097, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.91444206237793, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8496851921081543, "num_tokens": 160963460.0, "step": 4214 }, { "epoch": 0.5361913242590002, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.886241912841797, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8510768413543701, "num_tokens": 161003691.0, "step": 4215 }, { "epoch": 0.5363185345375906, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.901336669921875, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8531687259674072, "num_tokens": 161042056.0, "step": 4216 }, { "epoch": 0.5364457448161811, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.766687393188477, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8756545186042786, "num_tokens": 161078670.0, "step": 4217 }, { "epoch": 0.5365729550947717, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.691675186157227, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8655601143836975, "num_tokens": 161113992.0, "step": 4218 }, { "epoch": 0.5367001653733622, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.715930938720703, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.858116865158081, "num_tokens": 161153763.0, "step": 4219 }, { "epoch": 0.5368273756519527, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.77574348449707, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8374743461608887, "num_tokens": 161184186.0, "step": 4220 }, { "epoch": 0.5369545859305432, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.736356735229492, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8505632281303406, "num_tokens": 161219298.0, "step": 4221 }, { "epoch": 0.5370817962091337, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.64886474609375, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8679232597351074, "num_tokens": 161255252.0, "step": 4222 }, { "epoch": 0.5372090064877242, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 27.75310707092285, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8576631546020508, "num_tokens": 161295890.0, "step": 4223 }, { "epoch": 0.5373362167663147, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.599803924560547, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8504180908203125, "num_tokens": 161330808.0, "step": 4224 }, { "epoch": 0.5374634270449052, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.11101531982422, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8612061142921448, "num_tokens": 161370515.0, "step": 4225 }, { "epoch": 0.5375906373234958, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.645200729370117, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8671342134475708, "num_tokens": 161405257.0, "step": 4226 }, { "epoch": 0.5377178476020863, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.80560302734375, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8542114496231079, "num_tokens": 161443526.0, "step": 4227 }, { "epoch": 0.5378450578806767, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.304563522338867, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8472144603729248, "num_tokens": 161481262.0, "step": 4228 }, { "epoch": 0.5379722681592672, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.76134490966797, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8381662368774414, "num_tokens": 161517756.0, "step": 4229 }, { "epoch": 0.5380994784378578, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.776681900024414, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8612045049667358, "num_tokens": 161552680.0, "step": 4230 }, { "epoch": 0.5382266887164483, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 27.546306610107422, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8493633270263672, "num_tokens": 161588607.0, "step": 4231 }, { "epoch": 0.5383538989950388, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.263837814331055, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.836707592010498, "num_tokens": 161626427.0, "step": 4232 }, { "epoch": 0.5384811092736294, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.238460540771484, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8584418296813965, "num_tokens": 161665182.0, "step": 4233 }, { "epoch": 0.5386083195522198, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.717573165893555, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8538979291915894, "num_tokens": 161704685.0, "step": 4234 }, { "epoch": 0.5387355298308103, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.98678207397461, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.858342707157135, "num_tokens": 161742960.0, "step": 4235 }, { "epoch": 0.5388627401094008, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.09853172302246, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.852082371711731, "num_tokens": 161778199.0, "step": 4236 }, { "epoch": 0.5389899503879914, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.959667205810547, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8489397764205933, "num_tokens": 161811235.0, "step": 4237 }, { "epoch": 0.5391171606665819, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.917011260986328, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8521430492401123, "num_tokens": 161847669.0, "step": 4238 }, { "epoch": 0.5392443709451724, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.030710220336914, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8393523097038269, "num_tokens": 161888470.0, "step": 4239 }, { "epoch": 0.5393715812237628, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.100093841552734, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8338885307312012, "num_tokens": 161924563.0, "step": 4240 }, { "epoch": 0.5394987915023534, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.0175838470459, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8397359848022461, "num_tokens": 161959062.0, "step": 4241 }, { "epoch": 0.5396260017809439, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 27.948970794677734, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8568663597106934, "num_tokens": 161994755.0, "step": 4242 }, { "epoch": 0.5397532120595344, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.128551483154297, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8621507883071899, "num_tokens": 162032812.0, "step": 4243 }, { "epoch": 0.5398804223381249, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.85352897644043, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8555498123168945, "num_tokens": 162067130.0, "step": 4244 }, { "epoch": 0.5400076326167155, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.306087493896484, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.855262041091919, "num_tokens": 162107452.0, "step": 4245 }, { "epoch": 0.5401348428953059, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.938329696655273, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8536034822463989, "num_tokens": 162144220.0, "step": 4246 }, { "epoch": 0.5402620531738964, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.136009216308594, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8611481189727783, "num_tokens": 162181798.0, "step": 4247 }, { "epoch": 0.540389263452487, "ewc_loss": 0.0634765625, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 5.030632019042969e-05, "grad_norm": 28.25253677368164, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8451168537139893, "num_tokens": 162222501.0, "step": 4248 }, { "epoch": 0.5405164737310775, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.801807403564453, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8492051362991333, "num_tokens": 162265533.0, "step": 4249 }, { "epoch": 0.540643684009668, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.05821418762207, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8686549663543701, "num_tokens": 162304984.0, "step": 4250 }, { "epoch": 0.5407708942882585, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.237672805786133, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8429350852966309, "num_tokens": 162342358.0, "step": 4251 }, { "epoch": 0.540898104566849, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.977392196655273, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.853704571723938, "num_tokens": 162377343.0, "step": 4252 }, { "epoch": 0.5410253148454395, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.15479278564453, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8546303510665894, "num_tokens": 162416416.0, "step": 4253 }, { "epoch": 0.54115252512403, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.05401611328125, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8803297281265259, "num_tokens": 162447108.0, "step": 4254 }, { "epoch": 0.5412797354026205, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.463333129882812, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8555405139923096, "num_tokens": 162489659.0, "step": 4255 }, { "epoch": 0.5414069456812111, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.887666702270508, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8441953659057617, "num_tokens": 162525816.0, "step": 4256 }, { "epoch": 0.5415341559598016, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.432809829711914, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8492066264152527, "num_tokens": 162564154.0, "step": 4257 }, { "epoch": 0.5416613662383921, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.016666412353516, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8475213050842285, "num_tokens": 162602728.0, "step": 4258 }, { "epoch": 0.5417885765169825, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.01852798461914, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8583428859710693, "num_tokens": 162636659.0, "step": 4259 }, { "epoch": 0.5419157867955731, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.103662490844727, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8694518804550171, "num_tokens": 162676639.0, "step": 4260 }, { "epoch": 0.5420429970741636, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.12635040283203, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.834956169128418, "num_tokens": 162716763.0, "step": 4261 }, { "epoch": 0.5421702073527541, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.90666961669922, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8442783355712891, "num_tokens": 162753943.0, "step": 4262 }, { "epoch": 0.5422974176313446, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.195180892944336, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8486783504486084, "num_tokens": 162792938.0, "step": 4263 }, { "epoch": 0.5424246279099352, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.045684814453125, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8521983027458191, "num_tokens": 162838938.0, "step": 4264 }, { "epoch": 0.5425518381885256, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.93419075012207, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8595999479293823, "num_tokens": 162875026.0, "step": 4265 }, { "epoch": 0.5426790484671161, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.04840087890625, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8567941188812256, "num_tokens": 162910724.0, "step": 4266 }, { "epoch": 0.5428062587457066, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.103654861450195, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8570797443389893, "num_tokens": 162941602.0, "step": 4267 }, { "epoch": 0.5429334690242972, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.16863250732422, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8404629230499268, "num_tokens": 162980760.0, "step": 4268 }, { "epoch": 0.5430606793028877, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.03481674194336, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8640444874763489, "num_tokens": 163016155.0, "step": 4269 }, { "epoch": 0.5431878895814782, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.074451446533203, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8456339836120605, "num_tokens": 163052519.0, "step": 4270 }, { "epoch": 0.5433150998600687, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.181127548217773, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8624939918518066, "num_tokens": 163091806.0, "step": 4271 }, { "epoch": 0.5434423101386592, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 28.174236297607422, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8695803880691528, "num_tokens": 163128830.0, "step": 4272 }, { "epoch": 0.5435695204172497, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.979568481445312, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8494890928268433, "num_tokens": 163169567.0, "step": 4273 }, { "epoch": 0.5436967306958402, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.488672256469727, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8470257520675659, "num_tokens": 163202913.0, "step": 4274 }, { "epoch": 0.5438239409744308, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.884180068969727, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8611169457435608, "num_tokens": 163236741.0, "step": 4275 }, { "epoch": 0.5439511512530213, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.45371437072754, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8489679098129272, "num_tokens": 163281402.0, "step": 4276 }, { "epoch": 0.5440783615316117, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.187551498413086, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8621907234191895, "num_tokens": 163325288.0, "step": 4277 }, { "epoch": 0.5442055718102022, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.098052978515625, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8433245420455933, "num_tokens": 163367471.0, "step": 4278 }, { "epoch": 0.5443327820887928, "ewc_loss": 0.06396484375, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.054473876953125e-05, "grad_norm": 28.78858184814453, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8540754914283752, "num_tokens": 163400077.0, "step": 4279 }, { "epoch": 0.5444599923673833, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.078315734863281e-05, "grad_norm": 27.90470314025879, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8582919836044312, "num_tokens": 163433739.0, "step": 4280 }, { "epoch": 0.5445872026459738, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.237564086914062, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8542275428771973, "num_tokens": 163467447.0, "step": 4281 }, { "epoch": 0.5447144129245644, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.289487838745117, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8500059843063354, "num_tokens": 163506633.0, "step": 4282 }, { "epoch": 0.5448416232031548, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.425094604492188, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8382164835929871, "num_tokens": 163542773.0, "step": 4283 }, { "epoch": 0.5449688334817453, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.19300651550293, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.820224940776825, "num_tokens": 163585247.0, "step": 4284 }, { "epoch": 0.5450960437603358, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.21950912475586, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8412015438079834, "num_tokens": 163622802.0, "step": 4285 }, { "epoch": 0.5452232540389264, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.293087005615234, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8782442808151245, "num_tokens": 163655980.0, "step": 4286 }, { "epoch": 0.5453504643175169, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.046796798706055, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8596686124801636, "num_tokens": 163691494.0, "step": 4287 }, { "epoch": 0.5454776745961074, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.260215759277344, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8645724058151245, "num_tokens": 163725169.0, "step": 4288 }, { "epoch": 0.5456048848746978, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.31367301940918, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8565293550491333, "num_tokens": 163766050.0, "step": 4289 }, { "epoch": 0.5457320951532884, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.956642150878906, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8463141918182373, "num_tokens": 163799559.0, "step": 4290 }, { "epoch": 0.5458593054318789, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.30712890625, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.847319483757019, "num_tokens": 163837052.0, "step": 4291 }, { "epoch": 0.5459865157104694, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.904306411743164, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8504275679588318, "num_tokens": 163872494.0, "step": 4292 }, { "epoch": 0.5461137259890599, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.16973876953125, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8465914130210876, "num_tokens": 163913499.0, "step": 4293 }, { "epoch": 0.5462409362676505, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.009218215942383, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8336108922958374, "num_tokens": 163957965.0, "step": 4294 }, { "epoch": 0.5463681465462409, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 28.180362701416016, "learning_rate": 1e-06, "loss": 0.6391, "mean_token_accuracy": 0.8121861219406128, "num_tokens": 164005661.0, "step": 4295 }, { "epoch": 0.5464953568248314, "ewc_loss": 0.064453125, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.1021575927734375e-05, "grad_norm": 27.670690536499023, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8692247867584229, "num_tokens": 164044717.0, "step": 4296 }, { "epoch": 0.5466225671034219, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.477127075195312, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.852558970451355, "num_tokens": 164080022.0, "step": 4297 }, { "epoch": 0.5467497773820125, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.13096809387207, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8604379892349243, "num_tokens": 164112749.0, "step": 4298 }, { "epoch": 0.546876987660603, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.23151397705078, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8486043810844421, "num_tokens": 164152517.0, "step": 4299 }, { "epoch": 0.5470041979391935, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.89906883239746, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8593916893005371, "num_tokens": 164195245.0, "step": 4300 }, { "epoch": 0.5471314082177839, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 27.703365325927734, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8470147252082825, "num_tokens": 164230643.0, "step": 4301 }, { "epoch": 0.5472586184963745, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.82233428955078, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8500049710273743, "num_tokens": 164271089.0, "step": 4302 }, { "epoch": 0.547385828774965, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.164901733398438, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8512018918991089, "num_tokens": 164310618.0, "step": 4303 }, { "epoch": 0.5475130390535555, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.402559280395508, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8397918939590454, "num_tokens": 164348630.0, "step": 4304 }, { "epoch": 0.5476402493321461, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.02128028869629, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8588404655456543, "num_tokens": 164383735.0, "step": 4305 }, { "epoch": 0.5477674596107366, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.360063552856445, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8539192080497742, "num_tokens": 164426115.0, "step": 4306 }, { "epoch": 0.5478946698893271, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 27.816497802734375, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8464231491088867, "num_tokens": 164462119.0, "step": 4307 }, { "epoch": 0.5480218801679175, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.62175941467285, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8505065441131592, "num_tokens": 164503030.0, "step": 4308 }, { "epoch": 0.5481490904465081, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3589859008789062e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 27.759052276611328, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8551351428031921, "num_tokens": 164541091.0, "step": 4309 }, { "epoch": 0.5482763007250986, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.387176513671875, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8482542634010315, "num_tokens": 164577678.0, "step": 4310 }, { "epoch": 0.5484035110036891, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.278749465942383, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8502999544143677, "num_tokens": 164612779.0, "step": 4311 }, { "epoch": 0.5485307212822796, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.101398468017578, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8586635589599609, "num_tokens": 164649849.0, "step": 4312 }, { "epoch": 0.5486579315608702, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.818960189819336, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8507727980613708, "num_tokens": 164690538.0, "step": 4313 }, { "epoch": 0.5487851418394606, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 27.757740020751953, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8532942533493042, "num_tokens": 164729338.0, "step": 4314 }, { "epoch": 0.5489123521180511, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 29.158594131469727, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8653849959373474, "num_tokens": 164766648.0, "step": 4315 }, { "epoch": 0.5490395623966416, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 27.890615463256836, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8519871234893799, "num_tokens": 164804254.0, "step": 4316 }, { "epoch": 0.5491667726752322, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.382476806640625, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8669241666793823, "num_tokens": 164839943.0, "step": 4317 }, { "epoch": 0.5492939829538227, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.137439727783203, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8446989059448242, "num_tokens": 164887057.0, "step": 4318 }, { "epoch": 0.5494211932324132, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 27.924100875854492, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.853200376033783, "num_tokens": 164928515.0, "step": 4319 }, { "epoch": 0.5495484035110036, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.14984130859375e-05, "grad_norm": 28.365467071533203, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8522071838378906, "num_tokens": 164963818.0, "step": 4320 }, { "epoch": 0.5496756137895942, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 27.97065544128418, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8676074743270874, "num_tokens": 165001091.0, "step": 4321 }, { "epoch": 0.5498028240681847, "ewc_loss": 0.06494140625, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.125999450683594e-05, "grad_norm": 28.016067504882812, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.836035966873169, "num_tokens": 165046009.0, "step": 4322 }, { "epoch": 0.5499300343467752, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.070402145385742, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8493474721908569, "num_tokens": 165090729.0, "step": 4323 }, { "epoch": 0.5500572446253658, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.808013916015625, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8688993453979492, "num_tokens": 165129771.0, "step": 4324 }, { "epoch": 0.5501844549039563, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.073406219482422, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.869728147983551, "num_tokens": 165164310.0, "step": 4325 }, { "epoch": 0.5503116651825467, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.011598587036133, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8505111932754517, "num_tokens": 165202842.0, "step": 4326 }, { "epoch": 0.5504388754611372, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 27.925395965576172, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8577836751937866, "num_tokens": 165235998.0, "step": 4327 }, { "epoch": 0.5505660857397278, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.01607894897461, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8544739484786987, "num_tokens": 165271779.0, "step": 4328 }, { "epoch": 0.5506932960183183, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.137027740478516, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8467109799385071, "num_tokens": 165306136.0, "step": 4329 }, { "epoch": 0.5508205062969088, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 27.95490837097168, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8669790625572205, "num_tokens": 165342401.0, "step": 4330 }, { "epoch": 0.5509477165754993, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.961299896240234, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8578809499740601, "num_tokens": 165380120.0, "step": 4331 }, { "epoch": 0.5510749268540898, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.076208114624023, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8412479162216187, "num_tokens": 165418210.0, "step": 4332 }, { "epoch": 0.5512021371326803, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.959022521972656, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.833992063999176, "num_tokens": 165460952.0, "step": 4333 }, { "epoch": 0.5513293474112708, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.14847755432129, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8671575784683228, "num_tokens": 165500370.0, "step": 4334 }, { "epoch": 0.5514565576898613, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.395252227783203, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8540686368942261, "num_tokens": 165534089.0, "step": 4335 }, { "epoch": 0.5515837679684519, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.079639434814453, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8458439111709595, "num_tokens": 165573111.0, "step": 4336 }, { "epoch": 0.5517109782470424, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.079164505004883, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8435068130493164, "num_tokens": 165611942.0, "step": 4337 }, { "epoch": 0.5518381885256328, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.123899459838867, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8607742786407471, "num_tokens": 165651703.0, "step": 4338 }, { "epoch": 0.5519653988042234, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 27.946929931640625, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8513388633728027, "num_tokens": 165691257.0, "step": 4339 }, { "epoch": 0.5520926090828139, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.075056076049805, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8696714639663696, "num_tokens": 165727947.0, "step": 4340 }, { "epoch": 0.5522198193614044, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 27.954818725585938, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8634977340698242, "num_tokens": 165763400.0, "step": 4341 }, { "epoch": 0.5523470296399949, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.234031677246094, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8539187908172607, "num_tokens": 165809586.0, "step": 4342 }, { "epoch": 0.5524742399185855, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 27.78326416015625, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8551732301712036, "num_tokens": 165839565.0, "step": 4343 }, { "epoch": 0.5526014501971759, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.401596069335938, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8560088872909546, "num_tokens": 165873539.0, "step": 4344 }, { "epoch": 0.5527286604757664, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.833959579467773, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8620189428329468, "num_tokens": 165920987.0, "step": 4345 }, { "epoch": 0.5528558707543569, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.352602005004883, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8433964252471924, "num_tokens": 165965661.0, "step": 4346 }, { "epoch": 0.5529830810329475, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 27.78638458251953, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8485348224639893, "num_tokens": 165998239.0, "step": 4347 }, { "epoch": 0.553110291311538, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.409345626831055, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.851355791091919, "num_tokens": 166036489.0, "step": 4348 }, { "epoch": 0.5532375015901285, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 27.959020614624023, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8506854772567749, "num_tokens": 166078774.0, "step": 4349 }, { "epoch": 0.5533647118687189, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.32040023803711, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.85982346534729, "num_tokens": 166119555.0, "step": 4350 }, { "epoch": 0.5534919221473095, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.856491088867188, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8523339629173279, "num_tokens": 166159788.0, "step": 4351 }, { "epoch": 0.5536191324259, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.343042373657227, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8480911254882812, "num_tokens": 166200230.0, "step": 4352 }, { "epoch": 0.5537463427044905, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.943641662597656, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8573768138885498, "num_tokens": 166237864.0, "step": 4353 }, { "epoch": 0.553873552983081, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.318567276000977, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8560609221458435, "num_tokens": 166278347.0, "step": 4354 }, { "epoch": 0.5540007632616716, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.61069107055664, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8451145887374878, "num_tokens": 166315744.0, "step": 4355 }, { "epoch": 0.554127973540262, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.606311798095703, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8543111085891724, "num_tokens": 166352705.0, "step": 4356 }, { "epoch": 0.5542551838188525, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 27.629343032836914, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8573021292686462, "num_tokens": 166393956.0, "step": 4357 }, { "epoch": 0.554382394097443, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.305435180664062, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8316695094108582, "num_tokens": 166430172.0, "step": 4358 }, { "epoch": 0.5545096043760336, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 27.690649032592773, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8531889319419861, "num_tokens": 166468732.0, "step": 4359 }, { "epoch": 0.5546368146546241, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.16434669494629, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8496885895729065, "num_tokens": 166511383.0, "step": 4360 }, { "epoch": 0.5547640249332146, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.025367736816406, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8574144244194031, "num_tokens": 166547674.0, "step": 4361 }, { "epoch": 0.5548912352118052, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 27.966453552246094, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8465559482574463, "num_tokens": 166581763.0, "step": 4362 }, { "epoch": 0.5550184454903956, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.048112869262695, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8490666151046753, "num_tokens": 166618604.0, "step": 4363 }, { "epoch": 0.5551456557689861, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 27.732425689697266, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8635488748550415, "num_tokens": 166654812.0, "step": 4364 }, { "epoch": 0.5552728660475766, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.2996826171875, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8408386707305908, "num_tokens": 166693206.0, "step": 4365 }, { "epoch": 0.5554000763261672, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 27.729888916015625, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8504717350006104, "num_tokens": 166734531.0, "step": 4366 }, { "epoch": 0.5555272866047577, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.38546371459961, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8521767854690552, "num_tokens": 166771362.0, "step": 4367 }, { "epoch": 0.5556544968833482, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.070270538330078, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8455228805541992, "num_tokens": 166802784.0, "step": 4368 }, { "epoch": 0.5557817071619386, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.084224700927734, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8490352034568787, "num_tokens": 166835590.0, "step": 4369 }, { "epoch": 0.5559089174405292, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.92148780822754, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8478156924247742, "num_tokens": 166872145.0, "step": 4370 }, { "epoch": 0.5560361277191197, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.27048110961914, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8492255210876465, "num_tokens": 166912855.0, "step": 4371 }, { "epoch": 0.5561633379977102, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.934371948242188, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8381216526031494, "num_tokens": 166949482.0, "step": 4372 }, { "epoch": 0.5562905482763008, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.21180534362793, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.852314829826355, "num_tokens": 166986727.0, "step": 4373 }, { "epoch": 0.5564177585548913, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.086301803588867, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8450844287872314, "num_tokens": 167025869.0, "step": 4374 }, { "epoch": 0.5565449688334817, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 27.800119400024414, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8642178177833557, "num_tokens": 167059825.0, "step": 4375 }, { "epoch": 0.5566721791120722, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.22871971130371, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8579952120780945, "num_tokens": 167097375.0, "step": 4376 }, { "epoch": 0.5567993893906628, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 27.86161994934082, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8440463542938232, "num_tokens": 167136555.0, "step": 4377 }, { "epoch": 0.5569265996692533, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.115068435668945, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8511855602264404, "num_tokens": 167179870.0, "step": 4378 }, { "epoch": 0.5570538099478438, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.366085052490234, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8512547612190247, "num_tokens": 167215242.0, "step": 4379 }, { "epoch": 0.5571810202264343, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.259727478027344, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8613308668136597, "num_tokens": 167250690.0, "step": 4380 }, { "epoch": 0.5573082305050248, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.46590232849121, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8394131660461426, "num_tokens": 167283677.0, "step": 4381 }, { "epoch": 0.5574354407836153, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.26082420349121, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8542795181274414, "num_tokens": 167321355.0, "step": 4382 }, { "epoch": 0.5575626510622058, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.37739372253418, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8455776572227478, "num_tokens": 167363103.0, "step": 4383 }, { "epoch": 0.5576898613407963, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.17348861694336, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8668584823608398, "num_tokens": 167399732.0, "step": 4384 }, { "epoch": 0.5578170716193869, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.017560958862305, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8516595959663391, "num_tokens": 167434938.0, "step": 4385 }, { "epoch": 0.5579442818979774, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.54276466369629, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8561996221542358, "num_tokens": 167472293.0, "step": 4386 }, { "epoch": 0.5580714921765678, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 27.991899490356445, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8569804430007935, "num_tokens": 167511015.0, "step": 4387 }, { "epoch": 0.5581987024551583, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.621864318847656, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8534403443336487, "num_tokens": 167549777.0, "step": 4388 }, { "epoch": 0.5583259127337489, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.293020248413086, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8581124544143677, "num_tokens": 167592342.0, "step": 4389 }, { "epoch": 0.5584531230123394, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.372835159301758, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8565925359725952, "num_tokens": 167630829.0, "step": 4390 }, { "epoch": 0.5585803332909299, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.31183433532715, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8410860896110535, "num_tokens": 167675938.0, "step": 4391 }, { "epoch": 0.5587075435695205, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.465946197509766, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8566255569458008, "num_tokens": 167716550.0, "step": 4392 }, { "epoch": 0.5588347538481109, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.409828186035156, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.840191125869751, "num_tokens": 167754401.0, "step": 4393 }, { "epoch": 0.5589619641267014, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.797183990478516, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8567579388618469, "num_tokens": 167792124.0, "step": 4394 }, { "epoch": 0.5590891744052919, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.372793197631836, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8722072243690491, "num_tokens": 167829226.0, "step": 4395 }, { "epoch": 0.5592163846838825, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.502649307250977, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8623455762863159, "num_tokens": 167863439.0, "step": 4396 }, { "epoch": 0.559343594962473, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.307151794433594, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8540424108505249, "num_tokens": 167907019.0, "step": 4397 }, { "epoch": 0.5594708052410635, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.525094985961914, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8455976843833923, "num_tokens": 167942539.0, "step": 4398 }, { "epoch": 0.5595980155196539, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.690898895263672, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8527710437774658, "num_tokens": 167982848.0, "step": 4399 }, { "epoch": 0.5597252257982445, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.098167419433594, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8617483377456665, "num_tokens": 168017500.0, "step": 4400 }, { "epoch": 0.559852436076835, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.693675994873047, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8375303745269775, "num_tokens": 168052790.0, "step": 4401 }, { "epoch": 0.5599796463554255, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.33504295349121, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8597233891487122, "num_tokens": 168082541.0, "step": 4402 }, { "epoch": 0.560106856634016, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.68865966796875, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8597949743270874, "num_tokens": 168128218.0, "step": 4403 }, { "epoch": 0.5602340669126066, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.656085968017578, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8557233810424805, "num_tokens": 168165496.0, "step": 4404 }, { "epoch": 0.560361277191197, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.236225128173828, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8414582014083862, "num_tokens": 168204675.0, "step": 4405 }, { "epoch": 0.5604884874697875, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.54952621459961, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8488715887069702, "num_tokens": 168244832.0, "step": 4406 }, { "epoch": 0.560615697748378, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.481693267822266, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8488287925720215, "num_tokens": 168282440.0, "step": 4407 }, { "epoch": 0.5607429080269686, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.344989776611328, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8589462637901306, "num_tokens": 168325303.0, "step": 4408 }, { "epoch": 0.5608701183055591, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.73372459411621, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8592500686645508, "num_tokens": 168360628.0, "step": 4409 }, { "epoch": 0.5609973285841496, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.4617862701416, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.84024977684021, "num_tokens": 168398601.0, "step": 4410 }, { "epoch": 0.5611245388627402, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.878759384155273, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8390084505081177, "num_tokens": 168436893.0, "step": 4411 }, { "epoch": 0.5612517491413306, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.298664093017578, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8487082123756409, "num_tokens": 168478794.0, "step": 4412 }, { "epoch": 0.5613789594199211, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.93025016784668, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8508862853050232, "num_tokens": 168520184.0, "step": 4413 }, { "epoch": 0.5615061696985116, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.315507888793945, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8496720790863037, "num_tokens": 168554305.0, "step": 4414 }, { "epoch": 0.5616333799771022, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.81454086303711, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8365049362182617, "num_tokens": 168597752.0, "step": 4415 }, { "epoch": 0.5617605902556927, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.541156768798828, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8591784238815308, "num_tokens": 168637764.0, "step": 4416 }, { "epoch": 0.5618878005342832, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.8362979888916, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8327917456626892, "num_tokens": 168672260.0, "step": 4417 }, { "epoch": 0.5620150108128736, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.621618270874023, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8612778782844543, "num_tokens": 168711478.0, "step": 4418 }, { "epoch": 0.5621422210914642, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 29.083721160888672, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8560818433761597, "num_tokens": 168751773.0, "step": 4419 }, { "epoch": 0.5622694313700547, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.154685974121094, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.865363359451294, "num_tokens": 168786364.0, "step": 4420 }, { "epoch": 0.5623966416486452, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.80469512939453, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8513104915618896, "num_tokens": 168826819.0, "step": 4421 }, { "epoch": 0.5625238519272358, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.285654067993164, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8504244089126587, "num_tokens": 168869405.0, "step": 4422 }, { "epoch": 0.5626510622058263, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 29.03334617614746, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8597034215927124, "num_tokens": 168904323.0, "step": 4423 }, { "epoch": 0.5627782724844167, "ewc_loss": 0.0654296875, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.199464797973633, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8431148529052734, "num_tokens": 168948654.0, "step": 4424 }, { "epoch": 0.5629054827630072, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.74587631225586, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8597443699836731, "num_tokens": 168984333.0, "step": 4425 }, { "epoch": 0.5630326930415978, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.38036346435547, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8519929647445679, "num_tokens": 169019843.0, "step": 4426 }, { "epoch": 0.5631599033201883, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.763891220092773, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8635662794113159, "num_tokens": 169058177.0, "step": 4427 }, { "epoch": 0.5632871135987788, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.258291244506836, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8587859272956848, "num_tokens": 169102268.0, "step": 4428 }, { "epoch": 0.5634143238773693, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.890975952148438, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8481536507606506, "num_tokens": 169141709.0, "step": 4429 }, { "epoch": 0.5635415341559598, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.306049346923828, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.849143385887146, "num_tokens": 169177107.0, "step": 4430 }, { "epoch": 0.5636687444345503, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.6103572845459, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.849718451499939, "num_tokens": 169211531.0, "step": 4431 }, { "epoch": 0.5637959547131408, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.58939552307129, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8451128602027893, "num_tokens": 169247507.0, "step": 4432 }, { "epoch": 0.5639231649917313, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.57254981994629, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8666742444038391, "num_tokens": 169281007.0, "step": 4433 }, { "epoch": 0.5640503752703219, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.826688766479492, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8546460866928101, "num_tokens": 169322295.0, "step": 4434 }, { "epoch": 0.5641775855489124, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.554765701293945, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8497257232666016, "num_tokens": 169358184.0, "step": 4435 }, { "epoch": 0.5643047958275028, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.907838821411133, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8573588132858276, "num_tokens": 169391518.0, "step": 4436 }, { "epoch": 0.5644320061060933, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.140623092651367, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8523563146591187, "num_tokens": 169426743.0, "step": 4437 }, { "epoch": 0.5645592163846839, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.93198013305664, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8723657727241516, "num_tokens": 169466452.0, "step": 4438 }, { "epoch": 0.5646864266632744, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.00015640258789, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8592092990875244, "num_tokens": 169500295.0, "step": 4439 }, { "epoch": 0.5648136369418649, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.158288955688477, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8531178832054138, "num_tokens": 169541829.0, "step": 4440 }, { "epoch": 0.5649408472204555, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.143211364746094, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8378280997276306, "num_tokens": 169581547.0, "step": 4441 }, { "epoch": 0.5650680574990459, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.361154556274414, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8587253093719482, "num_tokens": 169622107.0, "step": 4442 }, { "epoch": 0.5651952677776364, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.87288475036621, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8510853052139282, "num_tokens": 169657258.0, "step": 4443 }, { "epoch": 0.5653224780562269, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.425796508789062, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8418479561805725, "num_tokens": 169691568.0, "step": 4444 }, { "epoch": 0.5654496883348175, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.168901443481445, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8600807189941406, "num_tokens": 169732429.0, "step": 4445 }, { "epoch": 0.565576898613408, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.769506454467773, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8459750413894653, "num_tokens": 169769224.0, "step": 4446 }, { "epoch": 0.5657041088919985, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.62784767150879, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8517838716506958, "num_tokens": 169812830.0, "step": 4447 }, { "epoch": 0.5658313191705889, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.076019287109375, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.849348783493042, "num_tokens": 169843823.0, "step": 4448 }, { "epoch": 0.5659585294491795, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.537494659423828, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8561357259750366, "num_tokens": 169882847.0, "step": 4449 }, { "epoch": 0.56608573972777, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.758251190185547, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8350057005882263, "num_tokens": 169921264.0, "step": 4450 }, { "epoch": 0.5662129500063605, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.013208389282227, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8582055568695068, "num_tokens": 169949681.0, "step": 4451 }, { "epoch": 0.566340160284951, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.30730628967285, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8673590421676636, "num_tokens": 169986254.0, "step": 4452 }, { "epoch": 0.5664673705635416, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.33942222595215, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.849356472492218, "num_tokens": 170022745.0, "step": 4453 }, { "epoch": 0.566594580842132, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.46735954284668, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.863853931427002, "num_tokens": 170057038.0, "step": 4454 }, { "epoch": 0.5667217911207225, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.925800323486328, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8719998598098755, "num_tokens": 170098460.0, "step": 4455 }, { "epoch": 0.566849001399313, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.620386123657227, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8408712148666382, "num_tokens": 170137689.0, "step": 4456 }, { "epoch": 0.5669762116779036, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.602680206298828, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8537477254867554, "num_tokens": 170176716.0, "step": 4457 }, { "epoch": 0.5671034219564941, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.173683166503906e-05, "grad_norm": 28.43950843811035, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8483773469924927, "num_tokens": 170212621.0, "step": 4458 }, { "epoch": 0.5672306322350846, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.953340530395508, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8437954187393188, "num_tokens": 170254493.0, "step": 4459 }, { "epoch": 0.5673578425136752, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.555143356323242, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.874723494052887, "num_tokens": 170290559.0, "step": 4460 }, { "epoch": 0.5674850527922656, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.212533950805664, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8485952615737915, "num_tokens": 170335079.0, "step": 4461 }, { "epoch": 0.5676122630708561, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.960651397705078, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.856364369392395, "num_tokens": 170370665.0, "step": 4462 }, { "epoch": 0.5677394733494466, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.570873260498047, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8523918390274048, "num_tokens": 170412585.0, "step": 4463 }, { "epoch": 0.5678666836280372, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.591880798339844, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8586236238479614, "num_tokens": 170450806.0, "step": 4464 }, { "epoch": 0.5679938939066277, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.940990447998047, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8529555201530457, "num_tokens": 170483265.0, "step": 4465 }, { "epoch": 0.5681211041852182, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.98590850830078, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8701349496841431, "num_tokens": 170520910.0, "step": 4466 }, { "epoch": 0.5682483144638086, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.69103240966797, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8353086709976196, "num_tokens": 170556516.0, "step": 4467 }, { "epoch": 0.5683755247423992, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.600234985351562, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8522105813026428, "num_tokens": 170595571.0, "step": 4468 }, { "epoch": 0.5685027350209897, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.807031631469727, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8733081817626953, "num_tokens": 170634117.0, "step": 4469 }, { "epoch": 0.5686299452995802, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.48111343383789, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8490604758262634, "num_tokens": 170677624.0, "step": 4470 }, { "epoch": 0.5687571555781707, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.748931884765625, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8392951488494873, "num_tokens": 170716732.0, "step": 4471 }, { "epoch": 0.5688843658567613, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.683063507080078, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8467046022415161, "num_tokens": 170758521.0, "step": 4472 }, { "epoch": 0.5690115761353517, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.93447494506836, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.853586733341217, "num_tokens": 170796799.0, "step": 4473 }, { "epoch": 0.5691387864139422, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.772846221923828, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8399434089660645, "num_tokens": 170838569.0, "step": 4474 }, { "epoch": 0.5692659966925327, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.537996292114258, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8558813333511353, "num_tokens": 170878206.0, "step": 4475 }, { "epoch": 0.5693932069711233, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.742250442504883, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8393105864524841, "num_tokens": 170915506.0, "step": 4476 }, { "epoch": 0.5695204172497138, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.27963638305664, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8487802743911743, "num_tokens": 170952303.0, "step": 4477 }, { "epoch": 0.5696476275283043, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.090206146240234, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8450796604156494, "num_tokens": 170993757.0, "step": 4478 }, { "epoch": 0.5697748378068948, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.542919158935547, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8513556718826294, "num_tokens": 171030240.0, "step": 4479 }, { "epoch": 0.5699020480854853, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.84189224243164, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8296024799346924, "num_tokens": 171069654.0, "step": 4480 }, { "epoch": 0.5700292583640758, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.533830642700195, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8370939493179321, "num_tokens": 171107341.0, "step": 4481 }, { "epoch": 0.5701564686426663, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.168107986450195, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8440805673599243, "num_tokens": 171144063.0, "step": 4482 }, { "epoch": 0.5702836789212569, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.612916946411133, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8447837233543396, "num_tokens": 171180448.0, "step": 4483 }, { "epoch": 0.5704108891998474, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.085054397583008, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8577312231063843, "num_tokens": 171214220.0, "step": 4484 }, { "epoch": 0.5705380994784378, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.489612579345703, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8593361973762512, "num_tokens": 171252798.0, "step": 4485 }, { "epoch": 0.5706653097570283, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.045570373535156, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8585739135742188, "num_tokens": 171295434.0, "step": 4486 }, { "epoch": 0.5707925200356189, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.90165901184082, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8551903963088989, "num_tokens": 171334129.0, "step": 4487 }, { "epoch": 0.5709197303142094, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.77577018737793, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8497875928878784, "num_tokens": 171370239.0, "step": 4488 }, { "epoch": 0.5710469405927999, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.56796646118164, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8671666979789734, "num_tokens": 171402411.0, "step": 4489 }, { "epoch": 0.5711741508713905, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.401296615600586, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8628832697868347, "num_tokens": 171438736.0, "step": 4490 }, { "epoch": 0.5713013611499809, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.53130531311035, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8589938879013062, "num_tokens": 171477949.0, "step": 4491 }, { "epoch": 0.5714285714285714, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.076732635498047, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8477739691734314, "num_tokens": 171520953.0, "step": 4492 }, { "epoch": 0.5715557817071619, "ewc_loss": 0.06591796875, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.1975250244140625e-05, "grad_norm": 28.833568572998047, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8585046529769897, "num_tokens": 171559415.0, "step": 4493 }, { "epoch": 0.5716829919857525, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.222332000732422, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8607670068740845, "num_tokens": 171593118.0, "step": 4494 }, { "epoch": 0.571810202264343, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.863182067871094, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8399927616119385, "num_tokens": 171634514.0, "step": 4495 }, { "epoch": 0.5719374125429335, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.11199378967285, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8567607402801514, "num_tokens": 171675239.0, "step": 4496 }, { "epoch": 0.5720646228215239, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.791292190551758, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8623877167701721, "num_tokens": 171713154.0, "step": 4497 }, { "epoch": 0.5721918331001145, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.749401092529297, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8453759551048279, "num_tokens": 171750046.0, "step": 4498 }, { "epoch": 0.572319043378705, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.980125427246094, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8431947827339172, "num_tokens": 171782708.0, "step": 4499 }, { "epoch": 0.5724462536572955, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.70851707458496, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8487241268157959, "num_tokens": 171819421.0, "step": 4500 }, { "epoch": 0.572573463935886, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.00720977783203, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8518548607826233, "num_tokens": 171860993.0, "step": 4501 }, { "epoch": 0.5727006742144766, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.1678524017334, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8513012528419495, "num_tokens": 171896894.0, "step": 4502 }, { "epoch": 0.572827884493067, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.064937591552734, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8501724600791931, "num_tokens": 171938694.0, "step": 4503 }, { "epoch": 0.5729550947716575, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 29.02040672302246, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8773636817932129, "num_tokens": 171972606.0, "step": 4504 }, { "epoch": 0.573082305050248, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.815963745117188, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8549744486808777, "num_tokens": 172010861.0, "step": 4505 }, { "epoch": 0.5732095153288386, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.27393341064453, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8378133773803711, "num_tokens": 172049867.0, "step": 4506 }, { "epoch": 0.5733367256074291, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.136850357055664, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8538845777511597, "num_tokens": 172092176.0, "step": 4507 }, { "epoch": 0.5734639358860196, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.96409034729004, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8530428409576416, "num_tokens": 172132789.0, "step": 4508 }, { "epoch": 0.5735911461646102, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 29.019498825073242, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8675989508628845, "num_tokens": 172170536.0, "step": 4509 }, { "epoch": 0.5737183564432006, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.348011016845703, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8524074554443359, "num_tokens": 172206828.0, "step": 4510 }, { "epoch": 0.5738455667217911, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.9632625579834, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8445600271224976, "num_tokens": 172242311.0, "step": 4511 }, { "epoch": 0.5739727770003816, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.357162475585938, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.848557710647583, "num_tokens": 172275135.0, "step": 4512 }, { "epoch": 0.5740999872789722, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.78642463684082, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8616446256637573, "num_tokens": 172308682.0, "step": 4513 }, { "epoch": 0.5742271975575627, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.145898818969727, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8479393720626831, "num_tokens": 172345834.0, "step": 4514 }, { "epoch": 0.5743544078361532, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.85342025756836, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8301409482955933, "num_tokens": 172387976.0, "step": 4515 }, { "epoch": 0.5744816181147436, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.376819610595703, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8523838520050049, "num_tokens": 172427447.0, "step": 4516 }, { "epoch": 0.5746088283933342, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.649927139282227, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8438454866409302, "num_tokens": 172468686.0, "step": 4517 }, { "epoch": 0.5747360386719247, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.426403045654297, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8495261073112488, "num_tokens": 172508190.0, "step": 4518 }, { "epoch": 0.5748632489505152, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 29.013608932495117, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8682878017425537, "num_tokens": 172552295.0, "step": 4519 }, { "epoch": 0.5749904592291057, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.404232025146484, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8579214215278625, "num_tokens": 172591065.0, "step": 4520 }, { "epoch": 0.5751176695076963, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.221366882324219e-05, "grad_norm": 28.94561195373535, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8555654287338257, "num_tokens": 172633741.0, "step": 4521 }, { "epoch": 0.5752448797862867, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.11451530456543, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.862096905708313, "num_tokens": 172669081.0, "step": 4522 }, { "epoch": 0.5753720900648772, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.083946228027344, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8311961889266968, "num_tokens": 172704956.0, "step": 4523 }, { "epoch": 0.5754993003434677, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.92417335510254, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8560115098953247, "num_tokens": 172742270.0, "step": 4524 }, { "epoch": 0.5756265106220583, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.21868324279785, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.840890645980835, "num_tokens": 172778065.0, "step": 4525 }, { "epoch": 0.5757537209006488, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.90350341796875, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8610261678695679, "num_tokens": 172815825.0, "step": 4526 }, { "epoch": 0.5758809311792393, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.055795669555664, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8528584241867065, "num_tokens": 172849560.0, "step": 4527 }, { "epoch": 0.5760081414578297, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.92459487915039, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.853721022605896, "num_tokens": 172885203.0, "step": 4528 }, { "epoch": 0.5761353517364203, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.953462600708008, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8514188528060913, "num_tokens": 172922341.0, "step": 4529 }, { "epoch": 0.5762625620150108, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.424732208251953, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.857583224773407, "num_tokens": 172953363.0, "step": 4530 }, { "epoch": 0.5763897722936013, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.73949432373047, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8585459589958191, "num_tokens": 172986253.0, "step": 4531 }, { "epoch": 0.5765169825721919, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.62668800354004, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8633575439453125, "num_tokens": 173023466.0, "step": 4532 }, { "epoch": 0.5766441928507824, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 28.998554229736328, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8475056886672974, "num_tokens": 173063329.0, "step": 4533 }, { "epoch": 0.5767714031293728, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.031431198120117, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8603448271751404, "num_tokens": 173100676.0, "step": 4534 }, { "epoch": 0.5768986134079633, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.195512771606445, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8638744354248047, "num_tokens": 173135088.0, "step": 4535 }, { "epoch": 0.5770258236865539, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.265201568603516, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8442777991294861, "num_tokens": 173175810.0, "step": 4536 }, { "epoch": 0.5771530339651444, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.79924774169922, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8532348275184631, "num_tokens": 173220826.0, "step": 4537 }, { "epoch": 0.5772802442437349, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.571685791015625, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8640626668930054, "num_tokens": 173258411.0, "step": 4538 }, { "epoch": 0.5774074545223254, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.05231285095215, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8366672992706299, "num_tokens": 173297298.0, "step": 4539 }, { "epoch": 0.5775346648009159, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.074966430664062, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8349527716636658, "num_tokens": 173330839.0, "step": 4540 }, { "epoch": 0.5776618750795064, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.12594223022461, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8453298807144165, "num_tokens": 173370219.0, "step": 4541 }, { "epoch": 0.5777890853580969, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.299903869628906, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8548800945281982, "num_tokens": 173410425.0, "step": 4542 }, { "epoch": 0.5779162956366874, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.81928825378418, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8524184226989746, "num_tokens": 173447109.0, "step": 4543 }, { "epoch": 0.578043505915278, "ewc_loss": 0.06640625, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.188371658325195, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8584624528884888, "num_tokens": 173490064.0, "step": 4544 }, { "epoch": 0.5781707161938685, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.179737091064453, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8559022545814514, "num_tokens": 173523791.0, "step": 4545 }, { "epoch": 0.5782979264724589, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.81753921508789, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.861449122428894, "num_tokens": 173564069.0, "step": 4546 }, { "epoch": 0.5784251367510495, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.23501968383789, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8512650728225708, "num_tokens": 173601893.0, "step": 4547 }, { "epoch": 0.57855234702964, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.11983299255371, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8632928133010864, "num_tokens": 173642299.0, "step": 4548 }, { "epoch": 0.5786795573082305, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.798702239990234, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.855851411819458, "num_tokens": 173675824.0, "step": 4549 }, { "epoch": 0.578806767586821, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.922605514526367, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8553604483604431, "num_tokens": 173712192.0, "step": 4550 }, { "epoch": 0.5789339778654116, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.178319931030273, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8602880239486694, "num_tokens": 173746949.0, "step": 4551 }, { "epoch": 0.579061188144002, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.79469108581543, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8549206256866455, "num_tokens": 173784527.0, "step": 4552 }, { "epoch": 0.5791883984225925, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.896827697753906, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8594574928283691, "num_tokens": 173820648.0, "step": 4553 }, { "epoch": 0.579315608701183, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.81703758239746, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8463051319122314, "num_tokens": 173860427.0, "step": 4554 }, { "epoch": 0.5794428189797736, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.920581817626953, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.855131983757019, "num_tokens": 173899876.0, "step": 4555 }, { "epoch": 0.5795700292583641, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.70696258544922, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8620312213897705, "num_tokens": 173937354.0, "step": 4556 }, { "epoch": 0.5796972395369546, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.04054832458496, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8422145843505859, "num_tokens": 173976039.0, "step": 4557 }, { "epoch": 0.5798244498155452, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.96864891052246, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8338776230812073, "num_tokens": 174006359.0, "step": 4558 }, { "epoch": 0.5799516600941356, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.908626556396484, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8446980714797974, "num_tokens": 174044662.0, "step": 4559 }, { "epoch": 0.5800788703727261, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 28.837926864624023, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8489633798599243, "num_tokens": 174077133.0, "step": 4560 }, { "epoch": 0.5802060806513166, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.102245330810547, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8483150005340576, "num_tokens": 174121873.0, "step": 4561 }, { "epoch": 0.5803332909299072, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.80150032043457, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8563187122344971, "num_tokens": 174158873.0, "step": 4562 }, { "epoch": 0.5804605012084977, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.714143753051758, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8605215549468994, "num_tokens": 174189222.0, "step": 4563 }, { "epoch": 0.5805877114870882, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.040987014770508, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8449892401695251, "num_tokens": 174229018.0, "step": 4564 }, { "epoch": 0.5807149217656786, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 28.744794845581055, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8534654378890991, "num_tokens": 174269215.0, "step": 4565 }, { "epoch": 0.5808421320442692, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.928050994873047, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8623183965682983, "num_tokens": 174303926.0, "step": 4566 }, { "epoch": 0.5809693423228597, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.607139587402344, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8560677766799927, "num_tokens": 174347491.0, "step": 4567 }, { "epoch": 0.5810965526014502, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.04758644104004, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8541867733001709, "num_tokens": 174384246.0, "step": 4568 }, { "epoch": 0.5812237628800407, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.53165054321289, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8539329767227173, "num_tokens": 174417081.0, "step": 4569 }, { "epoch": 0.5813509731586313, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.14996337890625, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8433729410171509, "num_tokens": 174457364.0, "step": 4570 }, { "epoch": 0.5814781834372217, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.572723388671875, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.853765070438385, "num_tokens": 174494304.0, "step": 4571 }, { "epoch": 0.5816053937158122, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.145580291748047, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8557673692703247, "num_tokens": 174528047.0, "step": 4572 }, { "epoch": 0.5817326039944027, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 28.627761840820312, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8577703833580017, "num_tokens": 174565990.0, "step": 4573 }, { "epoch": 0.5818598142729933, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 28.749279022216797, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8381158113479614, "num_tokens": 174599039.0, "step": 4574 }, { "epoch": 0.5819870245515838, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.021831512451172, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8470460772514343, "num_tokens": 174634786.0, "step": 4575 }, { "epoch": 0.5821142348301743, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 28.749906539916992, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8421977758407593, "num_tokens": 174677875.0, "step": 4576 }, { "epoch": 0.5822414451087647, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 28.848974227905273, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8478410840034485, "num_tokens": 174715421.0, "step": 4577 }, { "epoch": 0.5823686553873553, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.033201217651367, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8507492542266846, "num_tokens": 174746555.0, "step": 4578 }, { "epoch": 0.5824958656659458, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 28.88612174987793, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8419000506401062, "num_tokens": 174786470.0, "step": 4579 }, { "epoch": 0.5826230759445363, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 28.819293975830078, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8503668308258057, "num_tokens": 174827131.0, "step": 4580 }, { "epoch": 0.5827502862231269, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.04749298095703, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8455551862716675, "num_tokens": 174861391.0, "step": 4581 }, { "epoch": 0.5828774965017174, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.056060791015625, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8576655983924866, "num_tokens": 174897437.0, "step": 4582 }, { "epoch": 0.5830047067803078, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.317096710205078, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8435659408569336, "num_tokens": 174932504.0, "step": 4583 }, { "epoch": 0.5831319170588983, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.092857360839844, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8480604887008667, "num_tokens": 174974746.0, "step": 4584 }, { "epoch": 0.5832591273374889, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.98423194885254, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.847241997718811, "num_tokens": 175016155.0, "step": 4585 }, { "epoch": 0.5833863376160794, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.737476348876953, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8382586240768433, "num_tokens": 175060595.0, "step": 4586 }, { "epoch": 0.5835135478946699, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.575815200805664, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8526402115821838, "num_tokens": 175101278.0, "step": 4587 }, { "epoch": 0.5836407581732604, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.245208740234375e-05, "grad_norm": 29.068395614624023, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8455709218978882, "num_tokens": 175139479.0, "step": 4588 }, { "epoch": 0.5837679684518509, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.597854614257812, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8357123136520386, "num_tokens": 175182376.0, "step": 4589 }, { "epoch": 0.5838951787304414, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 28.87041473388672, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8455591201782227, "num_tokens": 175225449.0, "step": 4590 }, { "epoch": 0.5840223890090319, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.359094619750977, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8504791259765625, "num_tokens": 175255959.0, "step": 4591 }, { "epoch": 0.5841495992876224, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.36418342590332, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8403331637382507, "num_tokens": 175292142.0, "step": 4592 }, { "epoch": 0.584276809566213, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.091768264770508, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8457633256912231, "num_tokens": 175336630.0, "step": 4593 }, { "epoch": 0.5844040198448035, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.62068748474121, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8629244565963745, "num_tokens": 175372697.0, "step": 4594 }, { "epoch": 0.5845312301233939, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.32295799255371, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8404188752174377, "num_tokens": 175412395.0, "step": 4595 }, { "epoch": 0.5846584404019844, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.258872985839844, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8515124917030334, "num_tokens": 175447295.0, "step": 4596 }, { "epoch": 0.584785650680575, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.756778717041016, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8629841804504395, "num_tokens": 175480229.0, "step": 4597 }, { "epoch": 0.5849128609591655, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 30.233314514160156, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.851747989654541, "num_tokens": 175523081.0, "step": 4598 }, { "epoch": 0.585040071237756, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.439878463745117, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8559789657592773, "num_tokens": 175565036.0, "step": 4599 }, { "epoch": 0.5851672815163466, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.565683364868164, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8525484800338745, "num_tokens": 175606733.0, "step": 4600 }, { "epoch": 0.585294491794937, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.59551239013672, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8690522909164429, "num_tokens": 175637663.0, "step": 4601 }, { "epoch": 0.5854217020735275, "ewc_loss": 0.06689453125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.269050598144531e-05, "grad_norm": 29.197410583496094, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8537185192108154, "num_tokens": 175677381.0, "step": 4602 }, { "epoch": 0.585548912352118, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.704153060913086, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.863226056098938, "num_tokens": 175713049.0, "step": 4603 }, { "epoch": 0.5856761226307086, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.425403594970703, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8433173894882202, "num_tokens": 175753681.0, "step": 4604 }, { "epoch": 0.5858033329092991, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.893739700317383, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8404905200004578, "num_tokens": 175789725.0, "step": 4605 }, { "epoch": 0.5859305431878896, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.109283447265625, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8518339395523071, "num_tokens": 175823203.0, "step": 4606 }, { "epoch": 0.5860577534664801, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.347816467285156, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8464400172233582, "num_tokens": 175860506.0, "step": 4607 }, { "epoch": 0.5861849637450706, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.100116729736328, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8693084716796875, "num_tokens": 175895080.0, "step": 4608 }, { "epoch": 0.5863121740236611, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.448680877685547, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8461145162582397, "num_tokens": 175937113.0, "step": 4609 }, { "epoch": 0.5864393843022516, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.200401306152344, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.866122841835022, "num_tokens": 175966193.0, "step": 4610 }, { "epoch": 0.5865665945808421, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.418628692626953, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8449641466140747, "num_tokens": 176003771.0, "step": 4611 }, { "epoch": 0.5866938048594327, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.349729537963867, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8575639724731445, "num_tokens": 176050069.0, "step": 4612 }, { "epoch": 0.5868210151380232, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.653133392333984, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8666058778762817, "num_tokens": 176088763.0, "step": 4613 }, { "epoch": 0.5869482254166136, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.316734313964844e-05, "grad_norm": 29.34322738647461, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8546545505523682, "num_tokens": 176127853.0, "step": 4614 }, { "epoch": 0.5870754356952042, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.653100967407227, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8399168252944946, "num_tokens": 176173775.0, "step": 4615 }, { "epoch": 0.5872026459737947, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.340576171875e-05, "grad_norm": 29.141033172607422, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8561580181121826, "num_tokens": 176210878.0, "step": 4616 }, { "epoch": 0.5873298562523852, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 29.63969612121582, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8468506932258606, "num_tokens": 176255094.0, "step": 4617 }, { "epoch": 0.5874570665309757, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 28.965761184692383, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8500440120697021, "num_tokens": 176293586.0, "step": 4618 }, { "epoch": 0.5875842768095663, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.743562698364258, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8561103343963623, "num_tokens": 176329203.0, "step": 4619 }, { "epoch": 0.5877114870881567, "ewc_loss": 0.0673828125, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.2928924560546875e-05, "grad_norm": 29.310379028320312, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8702265024185181, "num_tokens": 176367984.0, "step": 4620 }, { "epoch": 0.5878386973667472, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 29.021970748901367, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8654338121414185, "num_tokens": 176408949.0, "step": 4621 }, { "epoch": 0.5879659076453377, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.397001266479492, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8687279224395752, "num_tokens": 176451073.0, "step": 4622 }, { "epoch": 0.5880931179239283, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.005990982055664, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8617252111434937, "num_tokens": 176490623.0, "step": 4623 }, { "epoch": 0.5882203282025188, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 29.08694076538086, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8398772478103638, "num_tokens": 176525771.0, "step": 4624 }, { "epoch": 0.5883475384811093, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 29.0355281829834, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.871705949306488, "num_tokens": 176565417.0, "step": 4625 }, { "epoch": 0.5884747487596997, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.04623794555664, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8505153656005859, "num_tokens": 176599005.0, "step": 4626 }, { "epoch": 0.5886019590382903, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.1683292388916, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8545832633972168, "num_tokens": 176637947.0, "step": 4627 }, { "epoch": 0.5887291693168808, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.064760208129883, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8435285091400146, "num_tokens": 176675241.0, "step": 4628 }, { "epoch": 0.5888563795954713, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.233646392822266, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8475397825241089, "num_tokens": 176715375.0, "step": 4629 }, { "epoch": 0.5889835898740619, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 28.993091583251953, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8530815243721008, "num_tokens": 176750977.0, "step": 4630 }, { "epoch": 0.5891108001526524, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.23003578186035, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8563159704208374, "num_tokens": 176788968.0, "step": 4631 }, { "epoch": 0.5892380104312428, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.168575286865234, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8501976728439331, "num_tokens": 176824399.0, "step": 4632 }, { "epoch": 0.5893652207098333, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.96142578125, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8472137451171875, "num_tokens": 176862053.0, "step": 4633 }, { "epoch": 0.5894924309884239, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.544788360595703, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8671286106109619, "num_tokens": 176896943.0, "step": 4634 }, { "epoch": 0.5896196412670144, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.027664184570312, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8525718450546265, "num_tokens": 176936263.0, "step": 4635 }, { "epoch": 0.5897468515456049, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.410717010498047, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8426446914672852, "num_tokens": 176976632.0, "step": 4636 }, { "epoch": 0.5898740618241954, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.199201583862305, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8583772778511047, "num_tokens": 177007036.0, "step": 4637 }, { "epoch": 0.5900012721027859, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.23175621032715, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8438199758529663, "num_tokens": 177042189.0, "step": 4638 }, { "epoch": 0.5901284823813764, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.3704776763916, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8468664884567261, "num_tokens": 177082428.0, "step": 4639 }, { "epoch": 0.5902556926599669, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 28.995826721191406, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8547875881195068, "num_tokens": 177124447.0, "step": 4640 }, { "epoch": 0.5903829029385574, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.245609283447266, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8686354160308838, "num_tokens": 177160516.0, "step": 4641 }, { "epoch": 0.590510113217148, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.020565032958984, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8603851795196533, "num_tokens": 177198100.0, "step": 4642 }, { "epoch": 0.5906373234957385, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.423500061035156, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8464682698249817, "num_tokens": 177243170.0, "step": 4643 }, { "epoch": 0.5907645337743289, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.000028610229492, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.849717915058136, "num_tokens": 177282976.0, "step": 4644 }, { "epoch": 0.5908917440529194, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.191986083984375, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8359503149986267, "num_tokens": 177323725.0, "step": 4645 }, { "epoch": 0.59101895433151, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.039793014526367, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8530636429786682, "num_tokens": 177352178.0, "step": 4646 }, { "epoch": 0.5911461646101005, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.025615692138672, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8498276472091675, "num_tokens": 177392322.0, "step": 4647 }, { "epoch": 0.591273374888691, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.17028045654297, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8518922328948975, "num_tokens": 177432013.0, "step": 4648 }, { "epoch": 0.5914005851672816, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 28.85506248474121, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8662376403808594, "num_tokens": 177473542.0, "step": 4649 }, { "epoch": 0.591527795445872, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.039133071899414, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8490769863128662, "num_tokens": 177516590.0, "step": 4650 }, { "epoch": 0.5916550057244625, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.894620895385742, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8661496639251709, "num_tokens": 177553750.0, "step": 4651 }, { "epoch": 0.591782216003053, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.468961715698242, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8462142944335938, "num_tokens": 177586093.0, "step": 4652 }, { "epoch": 0.5919094262816436, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 28.883581161499023, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8497174978256226, "num_tokens": 177626800.0, "step": 4653 }, { "epoch": 0.5920366365602341, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.39068603515625, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8512121438980103, "num_tokens": 177662336.0, "step": 4654 }, { "epoch": 0.5921638468388246, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 28.97658348083496, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8609434962272644, "num_tokens": 177706922.0, "step": 4655 }, { "epoch": 0.592291057117415, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.4133243560791, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8510051965713501, "num_tokens": 177743922.0, "step": 4656 }, { "epoch": 0.5924182673960056, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.925146102905273, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8597456216812134, "num_tokens": 177777069.0, "step": 4657 }, { "epoch": 0.5925454776745961, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 30.706769943237305, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8610151410102844, "num_tokens": 177810959.0, "step": 4658 }, { "epoch": 0.5926726879531866, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.6392879486084, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8441473245620728, "num_tokens": 177846366.0, "step": 4659 }, { "epoch": 0.5927998982317771, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.824275970458984, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.865009605884552, "num_tokens": 177881450.0, "step": 4660 }, { "epoch": 0.5929271085103677, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.233749389648438, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8537915945053101, "num_tokens": 177922931.0, "step": 4661 }, { "epoch": 0.5930543187889582, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.524389266967773, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8361634016036987, "num_tokens": 177962722.0, "step": 4662 }, { "epoch": 0.5931815290675486, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 29.68866539001465, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8503554463386536, "num_tokens": 177999956.0, "step": 4663 }, { "epoch": 0.5933087393461391, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.342622756958008, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8426238298416138, "num_tokens": 178032450.0, "step": 4664 }, { "epoch": 0.5934359496247297, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.40974998474121, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8532947301864624, "num_tokens": 178075028.0, "step": 4665 }, { "epoch": 0.5935631599033202, "ewc_loss": 0.06787109375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.364418029785156e-05, "grad_norm": 29.258630752563477, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8482011556625366, "num_tokens": 178108267.0, "step": 4666 }, { "epoch": 0.5936903701819107, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.951601028442383, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8521207571029663, "num_tokens": 178148091.0, "step": 4667 }, { "epoch": 0.5938175804605013, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.3882598876953125e-05, "grad_norm": 29.211605072021484, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.867210865020752, "num_tokens": 178185626.0, "step": 4668 }, { "epoch": 0.5939447907390917, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 28.797042846679688, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8613139390945435, "num_tokens": 178224165.0, "step": 4669 }, { "epoch": 0.5940720010176822, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.29547119140625, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8518544435501099, "num_tokens": 178264118.0, "step": 4670 }, { "epoch": 0.5941992112962727, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 29.189573287963867, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8640899658203125, "num_tokens": 178295634.0, "step": 4671 }, { "epoch": 0.5943264215748633, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.324501037597656, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8590024709701538, "num_tokens": 178327076.0, "step": 4672 }, { "epoch": 0.5944536318534538, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.282331466674805, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8447972536087036, "num_tokens": 178357469.0, "step": 4673 }, { "epoch": 0.5945808421320443, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.525005340576172, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8589694499969482, "num_tokens": 178393830.0, "step": 4674 }, { "epoch": 0.5947080524106347, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.927734375, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8533618450164795, "num_tokens": 178427560.0, "step": 4675 }, { "epoch": 0.5948352626892253, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.463275909423828, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8541730642318726, "num_tokens": 178464666.0, "step": 4676 }, { "epoch": 0.5949624729678158, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.80921745300293, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8430435657501221, "num_tokens": 178508771.0, "step": 4677 }, { "epoch": 0.5950896832464063, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.256196975708008, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8565899729728699, "num_tokens": 178548516.0, "step": 4678 }, { "epoch": 0.5952168935249968, "ewc_loss": 0.068359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.412101745605469e-05, "grad_norm": 28.86734962463379, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8186594247817993, "num_tokens": 178591276.0, "step": 4679 }, { "epoch": 0.5953441038035874, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.290803909301758, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8516979217529297, "num_tokens": 178632580.0, "step": 4680 }, { "epoch": 0.5954713140821778, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 29.082618713378906, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8484182357788086, "num_tokens": 178672858.0, "step": 4681 }, { "epoch": 0.5955985243607683, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.37086296081543, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8532971143722534, "num_tokens": 178706435.0, "step": 4682 }, { "epoch": 0.5957257346393589, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.246623992919922, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8502784967422485, "num_tokens": 178746309.0, "step": 4683 }, { "epoch": 0.5958529449179494, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.465740203857422, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8335645794868469, "num_tokens": 178787639.0, "step": 4684 }, { "epoch": 0.5959801551965399, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.076555252075195, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8568195104598999, "num_tokens": 178828271.0, "step": 4685 }, { "epoch": 0.5961073654751304, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.319744110107422, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8734712600708008, "num_tokens": 178862862.0, "step": 4686 }, { "epoch": 0.5962345757537209, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.11566734313965, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.865630030632019, "num_tokens": 178897969.0, "step": 4687 }, { "epoch": 0.5963617860323114, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.219457626342773, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8582295179367065, "num_tokens": 178942002.0, "step": 4688 }, { "epoch": 0.5964889963109019, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.411230087280273, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.86336350440979, "num_tokens": 178983473.0, "step": 4689 }, { "epoch": 0.5966162065894924, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.116392135620117, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.841838002204895, "num_tokens": 179022370.0, "step": 4690 }, { "epoch": 0.596743416868083, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.500930786132812, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8674815893173218, "num_tokens": 179064841.0, "step": 4691 }, { "epoch": 0.5968706271466735, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.077470779418945, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8250524997711182, "num_tokens": 179099399.0, "step": 4692 }, { "epoch": 0.5969978374252639, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.696035385131836, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8436599969863892, "num_tokens": 179133969.0, "step": 4693 }, { "epoch": 0.5971250477038544, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 28.83831214904785, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8499218225479126, "num_tokens": 179165456.0, "step": 4694 }, { "epoch": 0.597252257982445, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.450048446655273, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8556421995162964, "num_tokens": 179198209.0, "step": 4695 }, { "epoch": 0.5973794682610355, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 28.950918197631836, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8390833139419556, "num_tokens": 179233872.0, "step": 4696 }, { "epoch": 0.597506678539626, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.507442474365234, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8741476535797119, "num_tokens": 179272721.0, "step": 4697 }, { "epoch": 0.5976338888182166, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 28.87395668029785, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8515580892562866, "num_tokens": 179305781.0, "step": 4698 }, { "epoch": 0.597761099096807, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.481191635131836, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8578674793243408, "num_tokens": 179350049.0, "step": 4699 }, { "epoch": 0.5978883093753975, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.838459014892578, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8533713817596436, "num_tokens": 179388063.0, "step": 4700 }, { "epoch": 0.598015519653988, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.316919326782227, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8614829778671265, "num_tokens": 179427607.0, "step": 4701 }, { "epoch": 0.5981427299325786, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 28.961687088012695, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.835915207862854, "num_tokens": 179471456.0, "step": 4702 }, { "epoch": 0.5982699402111691, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.305896759033203, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8640416860580444, "num_tokens": 179514318.0, "step": 4703 }, { "epoch": 0.5983971504897596, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 28.807119369506836, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8671778440475464, "num_tokens": 179553307.0, "step": 4704 }, { "epoch": 0.59852436076835, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.335622787475586, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8459130525588989, "num_tokens": 179597288.0, "step": 4705 }, { "epoch": 0.5986515710469406, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 28.87867546081543, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8529382944107056, "num_tokens": 179629583.0, "step": 4706 }, { "epoch": 0.5987787813255311, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.142038345336914, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8473484516143799, "num_tokens": 179672104.0, "step": 4707 }, { "epoch": 0.5989059916041216, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.36260223388672, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.843314528465271, "num_tokens": 179712951.0, "step": 4708 }, { "epoch": 0.5990332018827121, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 28.953227996826172, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8514455556869507, "num_tokens": 179754626.0, "step": 4709 }, { "epoch": 0.5991604121613027, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.22020149230957, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8576550483703613, "num_tokens": 179790068.0, "step": 4710 }, { "epoch": 0.5992876224398932, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.195892333984375, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8353205919265747, "num_tokens": 179823936.0, "step": 4711 }, { "epoch": 0.5994148327184836, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 28.96678924560547, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8542736768722534, "num_tokens": 179860920.0, "step": 4712 }, { "epoch": 0.5995420429970741, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.021926879882812, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8423850536346436, "num_tokens": 179899848.0, "step": 4713 }, { "epoch": 0.5996692532756647, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.07783317565918, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8696063756942749, "num_tokens": 179934462.0, "step": 4714 }, { "epoch": 0.5997964635542552, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.40217399597168, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8530436158180237, "num_tokens": 179973955.0, "step": 4715 }, { "epoch": 0.5999236738328457, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.249921798706055, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8550348281860352, "num_tokens": 180015256.0, "step": 4716 }, { "epoch": 0.6000508841114363, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.244171142578125, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8610081672668457, "num_tokens": 180054145.0, "step": 4717 }, { "epoch": 0.6001780943900267, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.428857803344727, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.857609212398529, "num_tokens": 180093411.0, "step": 4718 }, { "epoch": 0.6003053046686172, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.168167114257812, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8627853393554688, "num_tokens": 180135799.0, "step": 4719 }, { "epoch": 0.6004325149472077, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.620025634765625, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8463790416717529, "num_tokens": 180172623.0, "step": 4720 }, { "epoch": 0.6005597252257983, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.241411209106445, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8505935072898865, "num_tokens": 180213833.0, "step": 4721 }, { "epoch": 0.6006869355043888, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.19730567932129, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8594522476196289, "num_tokens": 180254982.0, "step": 4722 }, { "epoch": 0.6008141457829793, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.2451114654541, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8608770370483398, "num_tokens": 180288544.0, "step": 4723 }, { "epoch": 0.6009413560615697, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.093374252319336, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.850227415561676, "num_tokens": 180330233.0, "step": 4724 }, { "epoch": 0.6010685663401603, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.204822540283203, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8571969270706177, "num_tokens": 180361516.0, "step": 4725 }, { "epoch": 0.6011957766187508, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.286623001098633, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8516829609870911, "num_tokens": 180404380.0, "step": 4726 }, { "epoch": 0.6013229868973413, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.161853790283203, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8457635641098022, "num_tokens": 180436916.0, "step": 4727 }, { "epoch": 0.6014501971759318, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.217966079711914, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8370523452758789, "num_tokens": 180476895.0, "step": 4728 }, { "epoch": 0.6015774074545224, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.459785461425781e-05, "grad_norm": 29.71558380126953, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8537379503250122, "num_tokens": 180515248.0, "step": 4729 }, { "epoch": 0.6017046177331128, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 28.896467208862305, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8603551387786865, "num_tokens": 180554575.0, "step": 4730 }, { "epoch": 0.6018318280117033, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.746700286865234, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8445637226104736, "num_tokens": 180594485.0, "step": 4731 }, { "epoch": 0.6019590382902938, "ewc_loss": 0.06884765625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.435943603515625e-05, "grad_norm": 28.880041122436523, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8391321897506714, "num_tokens": 180634746.0, "step": 4732 }, { "epoch": 0.6020862485688844, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.604454040527344, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.844171941280365, "num_tokens": 180674946.0, "step": 4733 }, { "epoch": 0.6022134588474749, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 28.993967056274414, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8536425828933716, "num_tokens": 180715538.0, "step": 4734 }, { "epoch": 0.6023406691260654, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.412694931030273, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8514217138290405, "num_tokens": 180755298.0, "step": 4735 }, { "epoch": 0.6024678794046558, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.432861328125, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8467897772789001, "num_tokens": 180797104.0, "step": 4736 }, { "epoch": 0.6025950896832464, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.191795349121094, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8643823266029358, "num_tokens": 180839845.0, "step": 4737 }, { "epoch": 0.6027222999618369, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.485620498657227, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.86674964427948, "num_tokens": 180878849.0, "step": 4738 }, { "epoch": 0.6028495102404274, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.557479858398438, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8517031669616699, "num_tokens": 180918427.0, "step": 4739 }, { "epoch": 0.602976720519018, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.368186950683594, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8457924723625183, "num_tokens": 180956206.0, "step": 4740 }, { "epoch": 0.6031039307976085, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.431638717651367, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8595393300056458, "num_tokens": 180996945.0, "step": 4741 }, { "epoch": 0.6032311410761989, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.28813934326172, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8577854633331299, "num_tokens": 181031763.0, "step": 4742 }, { "epoch": 0.6033583513547894, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.848125457763672, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8613031506538391, "num_tokens": 181070661.0, "step": 4743 }, { "epoch": 0.60348556163338, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.23101043701172, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8403787016868591, "num_tokens": 181112769.0, "step": 4744 }, { "epoch": 0.6036127719119705, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.68605613708496, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8542240262031555, "num_tokens": 181151155.0, "step": 4745 }, { "epoch": 0.603739982190561, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.17193603515625, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8538030385971069, "num_tokens": 181188689.0, "step": 4746 }, { "epoch": 0.6038671924691515, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.49669075012207, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8577969074249268, "num_tokens": 181230031.0, "step": 4747 }, { "epoch": 0.603994402747742, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.16537857055664, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8541441559791565, "num_tokens": 181271464.0, "step": 4748 }, { "epoch": 0.6041216130263325, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.627954483032227, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8475428223609924, "num_tokens": 181312607.0, "step": 4749 }, { "epoch": 0.604248823304923, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.00910186767578, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.861459493637085, "num_tokens": 181348628.0, "step": 4750 }, { "epoch": 0.6043760335835135, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.946956634521484, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8467566967010498, "num_tokens": 181387975.0, "step": 4751 }, { "epoch": 0.6045032438621041, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.4836273193359375e-05, "grad_norm": 29.19537925720215, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8672912120819092, "num_tokens": 181424163.0, "step": 4752 }, { "epoch": 0.6046304541406946, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.51374626159668, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8433965444564819, "num_tokens": 181457002.0, "step": 4753 }, { "epoch": 0.604757664419285, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.203519821166992, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8487821221351624, "num_tokens": 181494410.0, "step": 4754 }, { "epoch": 0.6048848746978756, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.58846092224121, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8515382409095764, "num_tokens": 181534595.0, "step": 4755 }, { "epoch": 0.6050120849764661, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.123748779296875, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8589917421340942, "num_tokens": 181570654.0, "step": 4756 }, { "epoch": 0.6051392952550566, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.362442016601562, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8480429649353027, "num_tokens": 181608366.0, "step": 4757 }, { "epoch": 0.6052665055336471, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.221221923828125, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8418663740158081, "num_tokens": 181647168.0, "step": 4758 }, { "epoch": 0.6053937158122377, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.86635398864746, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8565675020217896, "num_tokens": 181689177.0, "step": 4759 }, { "epoch": 0.6055209260908282, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.111927032470703, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8582611083984375, "num_tokens": 181723742.0, "step": 4760 }, { "epoch": 0.6056481363694186, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.45505142211914, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8625929355621338, "num_tokens": 181761539.0, "step": 4761 }, { "epoch": 0.6057753466480091, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.185932159423828, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8516647219657898, "num_tokens": 181799674.0, "step": 4762 }, { "epoch": 0.6059025569265997, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.509319305419922, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.840579092502594, "num_tokens": 181838784.0, "step": 4763 }, { "epoch": 0.6060297672051902, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.074047088623047, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8496975302696228, "num_tokens": 181875669.0, "step": 4764 }, { "epoch": 0.6061569774837807, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.66439437866211, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8366830348968506, "num_tokens": 181912450.0, "step": 4765 }, { "epoch": 0.6062841877623713, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.180885314941406, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8529523611068726, "num_tokens": 181947691.0, "step": 4766 }, { "epoch": 0.6064113980409617, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.600427627563477, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.858676552772522, "num_tokens": 181982364.0, "step": 4767 }, { "epoch": 0.6065386083195522, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.339609146118164, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8536307215690613, "num_tokens": 182017071.0, "step": 4768 }, { "epoch": 0.6066658185981427, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.334552764892578, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8532872200012207, "num_tokens": 182057234.0, "step": 4769 }, { "epoch": 0.6067930288767333, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.40603256225586, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8649959564208984, "num_tokens": 182092450.0, "step": 4770 }, { "epoch": 0.6069202391553238, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.233457565307617, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8452920913696289, "num_tokens": 182126398.0, "step": 4771 }, { "epoch": 0.6070474494339143, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.50637435913086, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8667956590652466, "num_tokens": 182160848.0, "step": 4772 }, { "epoch": 0.6071746597125047, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.21294403076172, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8614009618759155, "num_tokens": 182202596.0, "step": 4773 }, { "epoch": 0.6073018699910953, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.247135162353516, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8516851663589478, "num_tokens": 182239235.0, "step": 4774 }, { "epoch": 0.6074290802696858, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.27301597595215, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.865735650062561, "num_tokens": 182273662.0, "step": 4775 }, { "epoch": 0.6075562905482763, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.280601501464844, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8383680582046509, "num_tokens": 182311991.0, "step": 4776 }, { "epoch": 0.6076835008268668, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.58233642578125, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.854854941368103, "num_tokens": 182350722.0, "step": 4777 }, { "epoch": 0.6078107111054574, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.067211151123047, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.856040894985199, "num_tokens": 182386755.0, "step": 4778 }, { "epoch": 0.6079379213840478, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.485605239868164, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8698887825012207, "num_tokens": 182423128.0, "step": 4779 }, { "epoch": 0.6080651316626383, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.34040641784668, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.839407205581665, "num_tokens": 182461518.0, "step": 4780 }, { "epoch": 0.6081923419412288, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.42516326904297, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8529020547866821, "num_tokens": 182495707.0, "step": 4781 }, { "epoch": 0.6083195522198194, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.253000259399414, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8449897766113281, "num_tokens": 182539058.0, "step": 4782 }, { "epoch": 0.6084467624984099, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.486249923706055, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8406810164451599, "num_tokens": 182572482.0, "step": 4783 }, { "epoch": 0.6085739727770004, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.270549774169922, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8574908971786499, "num_tokens": 182607665.0, "step": 4784 }, { "epoch": 0.6087011830555908, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.411235809326172, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.855300784111023, "num_tokens": 182650503.0, "step": 4785 }, { "epoch": 0.6088283933341814, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.341691970825195, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8640577793121338, "num_tokens": 182681662.0, "step": 4786 }, { "epoch": 0.6089556036127719, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.404516220092773, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8416976928710938, "num_tokens": 182714912.0, "step": 4787 }, { "epoch": 0.6090828138913624, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.113826751708984, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8505507707595825, "num_tokens": 182748813.0, "step": 4788 }, { "epoch": 0.609210024169953, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.649978637695312, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8376586437225342, "num_tokens": 182786123.0, "step": 4789 }, { "epoch": 0.6093372344485435, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.0656795501709, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8671463131904602, "num_tokens": 182820550.0, "step": 4790 }, { "epoch": 0.6094644447271339, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.630470275878906, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8543663024902344, "num_tokens": 182863135.0, "step": 4791 }, { "epoch": 0.6095916550057244, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.001388549804688, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.855652391910553, "num_tokens": 182905193.0, "step": 4792 }, { "epoch": 0.609718865284315, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.947189331054688, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8542637825012207, "num_tokens": 182944934.0, "step": 4793 }, { "epoch": 0.6098460755629055, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.4424324035644531e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.401588439941406, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8405767679214478, "num_tokens": 182984604.0, "step": 4794 }, { "epoch": 0.609973285841496, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.56723403930664, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8512191772460938, "num_tokens": 183020421.0, "step": 4795 }, { "epoch": 0.6101004961200865, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.69944190979004, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8443362712860107, "num_tokens": 183057635.0, "step": 4796 }, { "epoch": 0.610227706398677, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.652870178222656, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8415511846542358, "num_tokens": 183094689.0, "step": 4797 }, { "epoch": 0.6103549166772675, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.060152053833008, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8592585921287537, "num_tokens": 183129118.0, "step": 4798 }, { "epoch": 0.610482126955858, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.51927947998047, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8518162965774536, "num_tokens": 183163365.0, "step": 4799 }, { "epoch": 0.6106093372344485, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.301198959350586, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8574374914169312, "num_tokens": 183199271.0, "step": 4800 }, { "epoch": 0.6107365475130391, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.393335342407227, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8730669617652893, "num_tokens": 183234523.0, "step": 4801 }, { "epoch": 0.6108637577916296, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.645448684692383, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8546285033226013, "num_tokens": 183267098.0, "step": 4802 }, { "epoch": 0.61099096807022, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.440326690673828, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8526945114135742, "num_tokens": 183306825.0, "step": 4803 }, { "epoch": 0.6111181783488105, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.389297485351562, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.853945255279541, "num_tokens": 183348712.0, "step": 4804 }, { "epoch": 0.6112453886274011, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.284770965576172, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8672491908073425, "num_tokens": 183383797.0, "step": 4805 }, { "epoch": 0.6113725989059916, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.485532760620117, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8661801815032959, "num_tokens": 183417484.0, "step": 4806 }, { "epoch": 0.6114998091845821, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.41054916381836, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8604018092155457, "num_tokens": 183453808.0, "step": 4807 }, { "epoch": 0.6116270194631727, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.383716583251953, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8606062531471252, "num_tokens": 183496693.0, "step": 4808 }, { "epoch": 0.6117542297417632, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.394460678100586, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8576568365097046, "num_tokens": 183536727.0, "step": 4809 }, { "epoch": 0.6118814400203536, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.739517211914062, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8544932007789612, "num_tokens": 183571809.0, "step": 4810 }, { "epoch": 0.6120086502989441, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.753820419311523, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8522706627845764, "num_tokens": 183610761.0, "step": 4811 }, { "epoch": 0.6121358605775347, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.356536865234375, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8523613810539246, "num_tokens": 183648304.0, "step": 4812 }, { "epoch": 0.6122630708561252, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.726699829101562, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8486883640289307, "num_tokens": 183684201.0, "step": 4813 }, { "epoch": 0.6123902811347157, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.579626083374023, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8478509187698364, "num_tokens": 183726595.0, "step": 4814 }, { "epoch": 0.6125174914133062, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.44601821899414, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8630414009094238, "num_tokens": 183762222.0, "step": 4815 }, { "epoch": 0.6126447016918967, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.621124267578125, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.867475688457489, "num_tokens": 183797167.0, "step": 4816 }, { "epoch": 0.6127719119704872, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.606565475463867, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8536834120750427, "num_tokens": 183839591.0, "step": 4817 }, { "epoch": 0.6128991222490777, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.841108322143555, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8520611524581909, "num_tokens": 183877862.0, "step": 4818 }, { "epoch": 0.6130263325276682, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.106847763061523, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8588461875915527, "num_tokens": 183915964.0, "step": 4819 }, { "epoch": 0.6131535428062588, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.918371200561523, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8508756160736084, "num_tokens": 183952823.0, "step": 4820 }, { "epoch": 0.6132807530848493, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.280994415283203, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8454260230064392, "num_tokens": 183991650.0, "step": 4821 }, { "epoch": 0.6134079633634397, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.691648483276367, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.855818510055542, "num_tokens": 184027768.0, "step": 4822 }, { "epoch": 0.6135351736420303, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.53131103515625e-05, "grad_norm": 29.347654342651367, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8646047711372375, "num_tokens": 184063754.0, "step": 4823 }, { "epoch": 0.6136623839206208, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.78406524658203, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8480722308158875, "num_tokens": 184105286.0, "step": 4824 }, { "epoch": 0.6137895941992113, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.098894119262695, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8587363958358765, "num_tokens": 184139312.0, "step": 4825 }, { "epoch": 0.6139168044778018, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.781660079956055, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8543450832366943, "num_tokens": 184181541.0, "step": 4826 }, { "epoch": 0.6140440147563924, "ewc_loss": 0.0693359375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.507469177246094e-05, "grad_norm": 29.378366470336914, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8538517951965332, "num_tokens": 184216857.0, "step": 4827 }, { "epoch": 0.6141712250349828, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.7189998626709, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8574560284614563, "num_tokens": 184253318.0, "step": 4828 }, { "epoch": 0.6142984353135733, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.818073272705078, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8656597137451172, "num_tokens": 184298625.0, "step": 4829 }, { "epoch": 0.6144256455921638, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.176576614379883, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8518896102905273, "num_tokens": 184334569.0, "step": 4830 }, { "epoch": 0.6145528558707544, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.697736740112305, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8370453715324402, "num_tokens": 184377477.0, "step": 4831 }, { "epoch": 0.6146800661493449, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.558263778686523, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8409020304679871, "num_tokens": 184416671.0, "step": 4832 }, { "epoch": 0.6148072764279354, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.804845809936523, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8486108779907227, "num_tokens": 184453174.0, "step": 4833 }, { "epoch": 0.6149344867065258, "ewc_loss": 0.06982421875, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.555152893066406e-05, "grad_norm": 29.590574264526367, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8608929514884949, "num_tokens": 184488313.0, "step": 4834 }, { "epoch": 0.6150616969851164, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.527414321899414, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8360493183135986, "num_tokens": 184523733.0, "step": 4835 }, { "epoch": 0.6151889072637069, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.47931671142578, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8508851528167725, "num_tokens": 184568399.0, "step": 4836 }, { "epoch": 0.6153161175422974, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.4424324035644531e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.315664291381836, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8533986806869507, "num_tokens": 184603302.0, "step": 4837 }, { "epoch": 0.615443327820888, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.655641555786133, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8731194138526917, "num_tokens": 184637346.0, "step": 4838 }, { "epoch": 0.6155705380994785, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 28.999109268188477, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.860059380531311, "num_tokens": 184671687.0, "step": 4839 }, { "epoch": 0.6156977483780689, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.528776168823242, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8641327619552612, "num_tokens": 184706867.0, "step": 4840 }, { "epoch": 0.6158249586566594, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.161361694335938, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8503459692001343, "num_tokens": 184743916.0, "step": 4841 }, { "epoch": 0.61595216893525, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.821189880371094, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.862055778503418, "num_tokens": 184783247.0, "step": 4842 }, { "epoch": 0.6160793792138405, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.316608428955078, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8539990186691284, "num_tokens": 184826501.0, "step": 4843 }, { "epoch": 0.616206589492431, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.69183921813965, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8527885675430298, "num_tokens": 184868743.0, "step": 4844 }, { "epoch": 0.6163337997710215, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.237548828125, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8506999611854553, "num_tokens": 184908992.0, "step": 4845 }, { "epoch": 0.616461010049612, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.56280517578125, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8422662019729614, "num_tokens": 184942456.0, "step": 4846 }, { "epoch": 0.6165882203282025, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.024818420410156, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8428722620010376, "num_tokens": 184975834.0, "step": 4847 }, { "epoch": 0.616715430606793, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 29.738353729248047, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8768861293792725, "num_tokens": 185012295.0, "step": 4848 }, { "epoch": 0.6168426408853835, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.26630973815918, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.858452558517456, "num_tokens": 185049272.0, "step": 4849 }, { "epoch": 0.6169698511639741, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.59766387939453, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8654869794845581, "num_tokens": 185081054.0, "step": 4850 }, { "epoch": 0.6170970614425646, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.368106842041016, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8505186438560486, "num_tokens": 185119809.0, "step": 4851 }, { "epoch": 0.617224271721155, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.617692947387695, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8648091554641724, "num_tokens": 185161462.0, "step": 4852 }, { "epoch": 0.6173514819997455, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.611852645874023, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8454618453979492, "num_tokens": 185196238.0, "step": 4853 }, { "epoch": 0.6174786922783361, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 29.375768661499023, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8539411425590515, "num_tokens": 185239317.0, "step": 4854 }, { "epoch": 0.6176059025569266, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.248950958251953, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8640775680541992, "num_tokens": 185273555.0, "step": 4855 }, { "epoch": 0.6177331128355171, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 29.741291046142578, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8613690137863159, "num_tokens": 185309721.0, "step": 4856 }, { "epoch": 0.6178603231141077, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.382522583007812, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8456189036369324, "num_tokens": 185350846.0, "step": 4857 }, { "epoch": 0.6179875333926982, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.527725219726562, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8471416234970093, "num_tokens": 185392472.0, "step": 4858 }, { "epoch": 0.6181147436712886, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.67548942565918, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.850841760635376, "num_tokens": 185438250.0, "step": 4859 }, { "epoch": 0.6182419539498791, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.387062072753906, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8507263660430908, "num_tokens": 185479455.0, "step": 4860 }, { "epoch": 0.6183691642284697, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.739913940429688, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8502902388572693, "num_tokens": 185518719.0, "step": 4861 }, { "epoch": 0.6184963745070602, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.19685935974121, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8383076190948486, "num_tokens": 185556671.0, "step": 4862 }, { "epoch": 0.6186235847856507, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 29.578920364379883, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8512268662452698, "num_tokens": 185595063.0, "step": 4863 }, { "epoch": 0.6187507950642412, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.390167236328125, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.845150351524353, "num_tokens": 185635626.0, "step": 4864 }, { "epoch": 0.6188780053428317, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.61199951171875, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8423376083374023, "num_tokens": 185671687.0, "step": 4865 }, { "epoch": 0.6190052156214222, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.323101043701172, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8572519421577454, "num_tokens": 185706459.0, "step": 4866 }, { "epoch": 0.6191324259000127, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 30.04022216796875, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8729473948478699, "num_tokens": 185743162.0, "step": 4867 }, { "epoch": 0.6192596361786032, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.543405532836914, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.838249921798706, "num_tokens": 185784036.0, "step": 4868 }, { "epoch": 0.6193868464571938, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.543743133544922, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8511894941329956, "num_tokens": 185821407.0, "step": 4869 }, { "epoch": 0.6195140567357843, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.444698333740234, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8592398166656494, "num_tokens": 185857789.0, "step": 4870 }, { "epoch": 0.6196412670143747, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.926937103271484, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8655530214309692, "num_tokens": 185896989.0, "step": 4871 }, { "epoch": 0.6197684772929652, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.564868927001953, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8691521286964417, "num_tokens": 185938123.0, "step": 4872 }, { "epoch": 0.6198956875715558, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.35624885559082, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8530301451683044, "num_tokens": 185973440.0, "step": 4873 }, { "epoch": 0.6200228978501463, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.865055084228516, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8559500575065613, "num_tokens": 186010069.0, "step": 4874 }, { "epoch": 0.6201501081287368, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.563798904418945, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8470406532287598, "num_tokens": 186048365.0, "step": 4875 }, { "epoch": 0.6202773184073274, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.65727424621582, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8596242070198059, "num_tokens": 186085358.0, "step": 4876 }, { "epoch": 0.6204045286859178, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.538724899291992, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8589092493057251, "num_tokens": 186118476.0, "step": 4877 }, { "epoch": 0.6205317389645083, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.76141357421875, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8506021499633789, "num_tokens": 186155487.0, "step": 4878 }, { "epoch": 0.6206589492430988, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.606334686279297, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8606514930725098, "num_tokens": 186192941.0, "step": 4879 }, { "epoch": 0.6207861595216894, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.463579177856445, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8565044403076172, "num_tokens": 186233860.0, "step": 4880 }, { "epoch": 0.6209133698002799, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.78275489807129, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8433061242103577, "num_tokens": 186273081.0, "step": 4881 }, { "epoch": 0.6210405800788704, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.602836608886719e-05, "grad_norm": 29.608224868774414, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8597242832183838, "num_tokens": 186310596.0, "step": 4882 }, { "epoch": 0.6211677903574608, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.5745849609375, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8502087593078613, "num_tokens": 186346262.0, "step": 4883 }, { "epoch": 0.6212950006360514, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.45905303955078, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8573909401893616, "num_tokens": 186386508.0, "step": 4884 }, { "epoch": 0.6214222109146419, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.81973648071289, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8514399528503418, "num_tokens": 186425184.0, "step": 4885 }, { "epoch": 0.6215494211932324, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.5789947509765625e-05, "grad_norm": 29.393770217895508, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8531173467636108, "num_tokens": 186463072.0, "step": 4886 }, { "epoch": 0.621676631471823, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.646514892578125, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8649393320083618, "num_tokens": 186504062.0, "step": 4887 }, { "epoch": 0.6218038417504135, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.79523468017578, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8450772762298584, "num_tokens": 186544564.0, "step": 4888 }, { "epoch": 0.6219310520290039, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.289093017578125, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8400685787200928, "num_tokens": 186587196.0, "step": 4889 }, { "epoch": 0.6220582623075944, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.492324829101562, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8533957004547119, "num_tokens": 186620673.0, "step": 4890 }, { "epoch": 0.622185472586185, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.514921188354492, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8560056090354919, "num_tokens": 186654028.0, "step": 4891 }, { "epoch": 0.6223126828647755, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.529306411743164, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.851918637752533, "num_tokens": 186693395.0, "step": 4892 }, { "epoch": 0.622439893143366, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.766029357910156, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.836267352104187, "num_tokens": 186734440.0, "step": 4893 }, { "epoch": 0.6225671034219565, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.34507942199707, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8574833273887634, "num_tokens": 186769637.0, "step": 4894 }, { "epoch": 0.622694313700547, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.560348510742188, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8497828841209412, "num_tokens": 186810188.0, "step": 4895 }, { "epoch": 0.6228215239791375, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.496810913085938, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8433983325958252, "num_tokens": 186847512.0, "step": 4896 }, { "epoch": 0.622948734257728, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.661357879638672, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8626173734664917, "num_tokens": 186878721.0, "step": 4897 }, { "epoch": 0.6230759445363185, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 29.559101104736328, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8593008518218994, "num_tokens": 186920239.0, "step": 4898 }, { "epoch": 0.6232031548149091, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 30.088529586791992, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8466736078262329, "num_tokens": 186964158.0, "step": 4899 }, { "epoch": 0.6233303650934996, "ewc_loss": 0.0703125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.067453384399414, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8698640465736389, "num_tokens": 187003176.0, "step": 4900 }, { "epoch": 0.62345757537209, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 30.11174201965332, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.852927565574646, "num_tokens": 187046152.0, "step": 4901 }, { "epoch": 0.6235847856506805, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 5.650520324707031e-05, "grad_norm": 29.389986038208008, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8656548261642456, "num_tokens": 187086629.0, "step": 4902 }, { "epoch": 0.6237119959292711, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.93868637084961, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8479175567626953, "num_tokens": 187122856.0, "step": 4903 }, { "epoch": 0.6238392062078616, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.615055084228516, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8698946237564087, "num_tokens": 187161568.0, "step": 4904 }, { "epoch": 0.6239664164864521, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.698204040527344e-05, "grad_norm": 29.851327896118164, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8538305759429932, "num_tokens": 187202831.0, "step": 4905 }, { "epoch": 0.6240936267650427, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.47271156311035, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8608627915382385, "num_tokens": 187239059.0, "step": 4906 }, { "epoch": 0.6242208370436332, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.685680389404297, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8486489057540894, "num_tokens": 187272940.0, "step": 4907 }, { "epoch": 0.6243480473222236, "ewc_loss": 0.07080078125, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.626678466796875e-05, "grad_norm": 29.542253494262695, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8676611185073853, "num_tokens": 187310963.0, "step": 4908 }, { "epoch": 0.6244752576008141, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.750139236450195, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8731032013893127, "num_tokens": 187345698.0, "step": 4909 }, { "epoch": 0.6246024678794047, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.392024993896484, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8532125949859619, "num_tokens": 187384762.0, "step": 4910 }, { "epoch": 0.6247296781579952, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.81261444091797, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8433219194412231, "num_tokens": 187424288.0, "step": 4911 }, { "epoch": 0.6248568884365857, "ewc_loss": 0.0712890625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.6743621826171875e-05, "grad_norm": 29.27581024169922, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8633949756622314, "num_tokens": 187461481.0, "step": 4912 }, { "epoch": 0.6249840987151762, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.754873275756836, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8712760210037231, "num_tokens": 187497690.0, "step": 4913 }, { "epoch": 0.6251113089937667, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.763587951660156, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8472833633422852, "num_tokens": 187537825.0, "step": 4914 }, { "epoch": 0.6252385192723572, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.3840274810791, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8646165132522583, "num_tokens": 187583998.0, "step": 4915 }, { "epoch": 0.6253657295509477, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 30.116125106811523, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8531479835510254, "num_tokens": 187621962.0, "step": 4916 }, { "epoch": 0.6254929398295382, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.19110679626465, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8598397374153137, "num_tokens": 187655913.0, "step": 4917 }, { "epoch": 0.6256201501081288, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.835309982299805, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8455896377563477, "num_tokens": 187692433.0, "step": 4918 }, { "epoch": 0.6257473603867193, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.375761032104492, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8664073944091797, "num_tokens": 187724311.0, "step": 4919 }, { "epoch": 0.6258745706653097, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.908079147338867, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8633043766021729, "num_tokens": 187766614.0, "step": 4920 }, { "epoch": 0.6260017809439002, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.280323028564453, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.865818977355957, "num_tokens": 187804847.0, "step": 4921 }, { "epoch": 0.6261289912224908, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.890274047851562, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8559318780899048, "num_tokens": 187842791.0, "step": 4922 }, { "epoch": 0.6262562015010813, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.404611587524414, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8440970182418823, "num_tokens": 187878304.0, "step": 4923 }, { "epoch": 0.6263834117796718, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.67184829711914, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8560082912445068, "num_tokens": 187915022.0, "step": 4924 }, { "epoch": 0.6265106220582624, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.387042999267578, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8551912307739258, "num_tokens": 187950533.0, "step": 4925 }, { "epoch": 0.6266378323368528, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.714298248291016, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8509664535522461, "num_tokens": 187989893.0, "step": 4926 }, { "epoch": 0.6267650426154433, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.11625862121582, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8483537435531616, "num_tokens": 188025031.0, "step": 4927 }, { "epoch": 0.6268922528940338, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.72257423400879, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8470813035964966, "num_tokens": 188060797.0, "step": 4928 }, { "epoch": 0.6270194631726244, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.253103256225586, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8481351137161255, "num_tokens": 188096271.0, "step": 4929 }, { "epoch": 0.6271466734512149, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.51759910583496, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8560187220573425, "num_tokens": 188136697.0, "step": 4930 }, { "epoch": 0.6272738837298054, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.467487335205078, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8526018261909485, "num_tokens": 188174715.0, "step": 4931 }, { "epoch": 0.6274010940083958, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.59539222717285, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.833674430847168, "num_tokens": 188216379.0, "step": 4932 }, { "epoch": 0.6275283042869864, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.32653045654297, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8590198755264282, "num_tokens": 188252183.0, "step": 4933 }, { "epoch": 0.6276555145655769, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.29384994506836, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8564820289611816, "num_tokens": 188291128.0, "step": 4934 }, { "epoch": 0.6277827248441674, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.28200340270996, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8369807600975037, "num_tokens": 188334256.0, "step": 4935 }, { "epoch": 0.627909935122758, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.4753360748291, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8469952344894409, "num_tokens": 188373207.0, "step": 4936 }, { "epoch": 0.6280371454013485, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.412782669067383, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8504387736320496, "num_tokens": 188414937.0, "step": 4937 }, { "epoch": 0.6281643556799389, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.29292106628418, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.850237250328064, "num_tokens": 188455398.0, "step": 4938 }, { "epoch": 0.6282915659585294, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.249618530273438, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8536381721496582, "num_tokens": 188496561.0, "step": 4939 }, { "epoch": 0.62841877623712, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.460403442382812, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8579890727996826, "num_tokens": 188531318.0, "step": 4940 }, { "epoch": 0.6285459865157105, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.183015823364258, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8458794951438904, "num_tokens": 188566807.0, "step": 4941 }, { "epoch": 0.628673196794301, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.768735885620117, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8489326238632202, "num_tokens": 188608062.0, "step": 4942 }, { "epoch": 0.6288004070728915, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.316341400146484, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.864284336566925, "num_tokens": 188644953.0, "step": 4943 }, { "epoch": 0.628927617351482, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.512935638427734, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.869600772857666, "num_tokens": 188687720.0, "step": 4944 }, { "epoch": 0.6290548276300725, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.28958511352539, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.847253680229187, "num_tokens": 188730077.0, "step": 4945 }, { "epoch": 0.629182037908663, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.41568374633789, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.859744668006897, "num_tokens": 188768306.0, "step": 4946 }, { "epoch": 0.6293092481872535, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.27857780456543, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8336980938911438, "num_tokens": 188807480.0, "step": 4947 }, { "epoch": 0.6294364584658441, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.276514053344727, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8444188833236694, "num_tokens": 188836383.0, "step": 4948 }, { "epoch": 0.6295636687444346, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.303695678710938, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8570232391357422, "num_tokens": 188876079.0, "step": 4949 }, { "epoch": 0.629690879023025, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.03508949279785, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8499286770820618, "num_tokens": 188912936.0, "step": 4950 }, { "epoch": 0.6298180893016155, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.857227325439453, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8511455059051514, "num_tokens": 188951475.0, "step": 4951 }, { "epoch": 0.6299452995802061, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.021167755126953, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.840152382850647, "num_tokens": 188996972.0, "step": 4952 }, { "epoch": 0.6300725098587966, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.54636001586914, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8740025758743286, "num_tokens": 189030706.0, "step": 4953 }, { "epoch": 0.6301997201373871, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.772886276245117, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.876966655254364, "num_tokens": 189065638.0, "step": 4954 }, { "epoch": 0.6303269304159776, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.55764389038086, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8674114942550659, "num_tokens": 189101747.0, "step": 4955 }, { "epoch": 0.6304541406945681, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.867206573486328, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8511708378791809, "num_tokens": 189142965.0, "step": 4956 }, { "epoch": 0.6305813509731586, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.770288467407227, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8603255748748779, "num_tokens": 189177822.0, "step": 4957 }, { "epoch": 0.6307085612517491, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.602340698242188, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8473721742630005, "num_tokens": 189215710.0, "step": 4958 }, { "epoch": 0.6308357715303397, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.826255798339844, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8264455795288086, "num_tokens": 189250788.0, "step": 4959 }, { "epoch": 0.6309629818089302, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.41547203063965, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.851836621761322, "num_tokens": 189278451.0, "step": 4960 }, { "epoch": 0.6310901920875207, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.408432006835938, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8573217391967773, "num_tokens": 189316857.0, "step": 4961 }, { "epoch": 0.6312174023661112, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.510589599609375, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8664608597755432, "num_tokens": 189356600.0, "step": 4962 }, { "epoch": 0.6313446126447017, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.82552146911621, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8488022685050964, "num_tokens": 189393134.0, "step": 4963 }, { "epoch": 0.6314718229232922, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.483652114868164, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8390146493911743, "num_tokens": 189433431.0, "step": 4964 }, { "epoch": 0.6315990332018827, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.590652465820312, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8393615484237671, "num_tokens": 189473738.0, "step": 4965 }, { "epoch": 0.6317262434804732, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.741762161254883, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.868432879447937, "num_tokens": 189509930.0, "step": 4966 }, { "epoch": 0.6318534537590638, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.480661392211914, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8596434593200684, "num_tokens": 189547801.0, "step": 4967 }, { "epoch": 0.6319806640376543, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.611377716064453, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8681343197822571, "num_tokens": 189582226.0, "step": 4968 }, { "epoch": 0.6321078743162447, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.50564193725586, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8538589477539062, "num_tokens": 189622915.0, "step": 4969 }, { "epoch": 0.6322350845948352, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 30.16084861755371, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8621401786804199, "num_tokens": 189658182.0, "step": 4970 }, { "epoch": 0.6323622948734258, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.636415481567383, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8569401502609253, "num_tokens": 189696627.0, "step": 4971 }, { "epoch": 0.6324895051520163, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.94295310974121, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8525012731552124, "num_tokens": 189729826.0, "step": 4972 }, { "epoch": 0.6326167154306068, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.45614242553711, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.860593318939209, "num_tokens": 189771504.0, "step": 4973 }, { "epoch": 0.6327439257091974, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.14250946044922, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8712072372436523, "num_tokens": 189809220.0, "step": 4974 }, { "epoch": 0.6328711359877878, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.440534591674805, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8626154661178589, "num_tokens": 189849586.0, "step": 4975 }, { "epoch": 0.6329983462663783, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.9727725982666, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8569002151489258, "num_tokens": 189885054.0, "step": 4976 }, { "epoch": 0.6331255565449688, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.733383178710938, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8478395342826843, "num_tokens": 189918286.0, "step": 4977 }, { "epoch": 0.6332527668235594, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.511123657226562, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8351854681968689, "num_tokens": 189956845.0, "step": 4978 }, { "epoch": 0.6333799771021499, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.906429290771484, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8710036277770996, "num_tokens": 189994827.0, "step": 4979 }, { "epoch": 0.6335071873807404, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.725671768188477, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8500309586524963, "num_tokens": 190036066.0, "step": 4980 }, { "epoch": 0.6336343976593308, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.85370445251465, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8656128644943237, "num_tokens": 190073326.0, "step": 4981 }, { "epoch": 0.6337616079379214, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7220458984375e-05, "grad_norm": 29.793914794921875, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8523424863815308, "num_tokens": 190109125.0, "step": 4982 }, { "epoch": 0.6338888182165119, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.877206802368164, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8613251447677612, "num_tokens": 190148520.0, "step": 4983 }, { "epoch": 0.6340160284951024, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.737836837768555, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8578389286994934, "num_tokens": 190184272.0, "step": 4984 }, { "epoch": 0.6341432387736929, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.423776626586914, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8547569513320923, "num_tokens": 190221483.0, "step": 4985 }, { "epoch": 0.6342704490522835, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.615419387817383, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8670696020126343, "num_tokens": 190266430.0, "step": 4986 }, { "epoch": 0.6343976593308739, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.590595245361328, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8662375807762146, "num_tokens": 190308575.0, "step": 4987 }, { "epoch": 0.6345248696094644, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.95596694946289, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8641681671142578, "num_tokens": 190344584.0, "step": 4988 }, { "epoch": 0.6346520798880549, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.651561737060547, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8619403839111328, "num_tokens": 190376497.0, "step": 4989 }, { "epoch": 0.6347792901666455, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.896480560302734, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.843932032585144, "num_tokens": 190416879.0, "step": 4990 }, { "epoch": 0.634906500445236, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.492822647094727, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8751024603843689, "num_tokens": 190454684.0, "step": 4991 }, { "epoch": 0.6350337107238265, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.865707397460938, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8487482070922852, "num_tokens": 190490524.0, "step": 4992 }, { "epoch": 0.635160921002417, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.712488174438477, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8620464205741882, "num_tokens": 190526558.0, "step": 4993 }, { "epoch": 0.6352881312810075, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.630327224731445, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8556616306304932, "num_tokens": 190571206.0, "step": 4994 }, { "epoch": 0.635415341559598, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.66107177734375, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8441378474235535, "num_tokens": 190611480.0, "step": 4995 }, { "epoch": 0.6355425518381885, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.56393051147461, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8276541233062744, "num_tokens": 190647699.0, "step": 4996 }, { "epoch": 0.6356697621167791, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.585386276245117, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8557288646697998, "num_tokens": 190684863.0, "step": 4997 }, { "epoch": 0.6357969723953696, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.497360229492188, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8395829200744629, "num_tokens": 190725006.0, "step": 4998 }, { "epoch": 0.63592418267396, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.46753692626953, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8537861704826355, "num_tokens": 190757431.0, "step": 4999 }, { "epoch": 0.6360513929525505, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.83926010131836, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8591961860656738, "num_tokens": 190793118.0, "step": 5000 }, { "epoch": 0.6361786032311411, "ewc_loss": 0.07177734375, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.745887756347656e-05, "grad_norm": 29.414630889892578, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8640516996383667, "num_tokens": 190837105.0, "step": 5001 }, { "epoch": 0.6363058135097316, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.77140235900879, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8496103882789612, "num_tokens": 190875372.0, "step": 5002 }, { "epoch": 0.6364330237883221, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.470613479614258, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8602443933486938, "num_tokens": 190921307.0, "step": 5003 }, { "epoch": 0.6365602340669126, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.4121036529541, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8542478680610657, "num_tokens": 190967919.0, "step": 5004 }, { "epoch": 0.6366874443455031, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.63471031188965, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8538817763328552, "num_tokens": 191005668.0, "step": 5005 }, { "epoch": 0.6368146546240936, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.50421905517578, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8445900082588196, "num_tokens": 191044455.0, "step": 5006 }, { "epoch": 0.6369418649026841, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.82408905029297, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8595327734947205, "num_tokens": 191086369.0, "step": 5007 }, { "epoch": 0.6370690751812746, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.265722274780273, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8506597280502319, "num_tokens": 191127070.0, "step": 5008 }, { "epoch": 0.6371962854598652, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.943246841430664, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.869134247303009, "num_tokens": 191163623.0, "step": 5009 }, { "epoch": 0.6373234957384557, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.216899871826172, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8500359654426575, "num_tokens": 191196790.0, "step": 5010 }, { "epoch": 0.6374507060170462, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.1518497467041, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8521310091018677, "num_tokens": 191230265.0, "step": 5011 }, { "epoch": 0.6375779162956366, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.27993392944336, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8465937376022339, "num_tokens": 191266781.0, "step": 5012 }, { "epoch": 0.6377051265742272, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.053354263305664, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8446160554885864, "num_tokens": 191305869.0, "step": 5013 }, { "epoch": 0.6378323368528177, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.574134826660156, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8430122137069702, "num_tokens": 191343498.0, "step": 5014 }, { "epoch": 0.6379595471314082, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.401329040527344, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8401869535446167, "num_tokens": 191383260.0, "step": 5015 }, { "epoch": 0.6380867574099988, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.575586318969727, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8466247320175171, "num_tokens": 191417350.0, "step": 5016 }, { "epoch": 0.6382139676885893, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.397079467773438, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8558944463729858, "num_tokens": 191454336.0, "step": 5017 }, { "epoch": 0.6383411779671797, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 29.54987335205078, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8561520576477051, "num_tokens": 191489977.0, "step": 5018 }, { "epoch": 0.6384683882457702, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.63807487487793, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8442206382751465, "num_tokens": 191524622.0, "step": 5019 }, { "epoch": 0.6385955985243608, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.506118774414062, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.844154417514801, "num_tokens": 191565389.0, "step": 5020 }, { "epoch": 0.6387228088029513, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.072269439697266, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8735460638999939, "num_tokens": 191604640.0, "step": 5021 }, { "epoch": 0.6388500190815418, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.53371810913086, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8537104725837708, "num_tokens": 191651029.0, "step": 5022 }, { "epoch": 0.6389772293601323, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.893470764160156, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8560880422592163, "num_tokens": 191690524.0, "step": 5023 }, { "epoch": 0.6391044396387228, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.895877838134766, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8506796956062317, "num_tokens": 191733992.0, "step": 5024 }, { "epoch": 0.6392316499173133, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.92534065246582, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8651509284973145, "num_tokens": 191769717.0, "step": 5025 }, { "epoch": 0.6393588601959038, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.523271560668945, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8390778303146362, "num_tokens": 191813334.0, "step": 5026 }, { "epoch": 0.6394860704744944, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.132122039794922, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8572478890419006, "num_tokens": 191848295.0, "step": 5027 }, { "epoch": 0.6396132807530849, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.623952865600586, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8467521071434021, "num_tokens": 191885329.0, "step": 5028 }, { "epoch": 0.6397404910316754, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.10334014892578, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8681648969650269, "num_tokens": 191918070.0, "step": 5029 }, { "epoch": 0.6398677013102658, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.446575164794922, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8582214117050171, "num_tokens": 191966084.0, "step": 5030 }, { "epoch": 0.6399949115888564, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.10807991027832, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8537386059761047, "num_tokens": 192004907.0, "step": 5031 }, { "epoch": 0.6401221218674469, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 30.07548713684082, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8377996683120728, "num_tokens": 192042942.0, "step": 5032 }, { "epoch": 0.6402493321460374, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.537275314331055, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8546668291091919, "num_tokens": 192081133.0, "step": 5033 }, { "epoch": 0.6403765424246279, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.75306510925293, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8549754619598389, "num_tokens": 192120171.0, "step": 5034 }, { "epoch": 0.6405037527032185, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.937503814697266, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8550781607627869, "num_tokens": 192159583.0, "step": 5035 }, { "epoch": 0.6406309629818089, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.070466995239258, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8523876667022705, "num_tokens": 192199513.0, "step": 5036 }, { "epoch": 0.6407581732603994, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 29.68622398376465, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8570044636726379, "num_tokens": 192228352.0, "step": 5037 }, { "epoch": 0.6408853835389899, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.250511169433594, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8489705324172974, "num_tokens": 192264480.0, "step": 5038 }, { "epoch": 0.6410125938175805, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.8050537109375, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8543145656585693, "num_tokens": 192303307.0, "step": 5039 }, { "epoch": 0.641139804096171, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.41871452331543, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8567836880683899, "num_tokens": 192342064.0, "step": 5040 }, { "epoch": 0.6412670143747615, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.996932983398438, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8615860939025879, "num_tokens": 192379297.0, "step": 5041 }, { "epoch": 0.6413942246533519, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.852737426757812, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8584473133087158, "num_tokens": 192416087.0, "step": 5042 }, { "epoch": 0.6415214349319425, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 30.34576416015625, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8597148656845093, "num_tokens": 192454421.0, "step": 5043 }, { "epoch": 0.641648645210533, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.740753173828125, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8516375422477722, "num_tokens": 192493860.0, "step": 5044 }, { "epoch": 0.6417758554891235, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.19833755493164, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8450596332550049, "num_tokens": 192533672.0, "step": 5045 }, { "epoch": 0.641903065767714, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.754188537597656, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8426104187965393, "num_tokens": 192569569.0, "step": 5046 }, { "epoch": 0.6420302760463046, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.869525909423828, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8607141375541687, "num_tokens": 192610716.0, "step": 5047 }, { "epoch": 0.642157486324895, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.43931007385254, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8480379581451416, "num_tokens": 192648040.0, "step": 5048 }, { "epoch": 0.6422846966034855, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.720895767211914, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8528428077697754, "num_tokens": 192690063.0, "step": 5049 }, { "epoch": 0.6424119068820761, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.387651443481445, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8499877452850342, "num_tokens": 192732067.0, "step": 5050 }, { "epoch": 0.6425391171606666, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.933074951171875, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8645487427711487, "num_tokens": 192768598.0, "step": 5051 }, { "epoch": 0.6426663274392571, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.2952823638916, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8256174325942993, "num_tokens": 192805420.0, "step": 5052 }, { "epoch": 0.6427935377178476, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.96475601196289, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8491106033325195, "num_tokens": 192852082.0, "step": 5053 }, { "epoch": 0.6429207479964381, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.3881893157959, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8550769090652466, "num_tokens": 192892846.0, "step": 5054 }, { "epoch": 0.6430479582750286, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.53811264038086, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8719730973243713, "num_tokens": 192932091.0, "step": 5055 }, { "epoch": 0.6431751685536191, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4722347259521484e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 30.126964569091797, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8561173677444458, "num_tokens": 192960628.0, "step": 5056 }, { "epoch": 0.6433023788322096, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.7183837890625, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8483178615570068, "num_tokens": 193002075.0, "step": 5057 }, { "epoch": 0.6434295891108002, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 30.09976577758789, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8437395095825195, "num_tokens": 193039720.0, "step": 5058 }, { "epoch": 0.6435567993893907, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.487905502319336, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8540530204772949, "num_tokens": 193072806.0, "step": 5059 }, { "epoch": 0.6436840096679812, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 30.111026763916016, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8640245199203491, "num_tokens": 193108478.0, "step": 5060 }, { "epoch": 0.6438112199465716, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.656024932861328, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8571823239326477, "num_tokens": 193150663.0, "step": 5061 }, { "epoch": 0.6439384302251622, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.793571472167969e-05, "grad_norm": 29.726856231689453, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8575653433799744, "num_tokens": 193191499.0, "step": 5062 }, { "epoch": 0.6440656405037527, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.82552719116211, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8453649282455444, "num_tokens": 193236103.0, "step": 5063 }, { "epoch": 0.6441928507823432, "ewc_loss": 0.072265625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.7697296142578125e-05, "grad_norm": 29.804540634155273, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8331160545349121, "num_tokens": 193274095.0, "step": 5064 }, { "epoch": 0.6443200610609338, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.662151336669922, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8606452941894531, "num_tokens": 193316988.0, "step": 5065 }, { "epoch": 0.6444472713395243, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 30.13692283630371, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.85285484790802, "num_tokens": 193357430.0, "step": 5066 }, { "epoch": 0.6445744816181147, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.43619155883789, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8624127507209778, "num_tokens": 193390392.0, "step": 5067 }, { "epoch": 0.6447016918967052, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 30.039485931396484, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.85181725025177, "num_tokens": 193430961.0, "step": 5068 }, { "epoch": 0.6448289021752958, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.24644660949707, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8524179458618164, "num_tokens": 193473767.0, "step": 5069 }, { "epoch": 0.6449561124538863, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.09273910522461, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8427721261978149, "num_tokens": 193511375.0, "step": 5070 }, { "epoch": 0.6450833227324768, "ewc_loss": 0.07275390625, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.817413330078125e-05, "grad_norm": 29.81315040588379, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8651798963546753, "num_tokens": 193551296.0, "step": 5071 }, { "epoch": 0.6452105330110673, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.19053840637207, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8577841520309448, "num_tokens": 193596484.0, "step": 5072 }, { "epoch": 0.6453377432896578, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.063613891601562, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8608957529067993, "num_tokens": 193637513.0, "step": 5073 }, { "epoch": 0.6454649535682483, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.27375030517578, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8495668172836304, "num_tokens": 193672808.0, "step": 5074 }, { "epoch": 0.6455921638468388, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.006492614746094, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8703309893608093, "num_tokens": 193713018.0, "step": 5075 }, { "epoch": 0.6457193741254293, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.358259201049805, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8574939370155334, "num_tokens": 193757649.0, "step": 5076 }, { "epoch": 0.6458465844040199, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.963699340820312, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8513777256011963, "num_tokens": 193792390.0, "step": 5077 }, { "epoch": 0.6459737946826104, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.94585609436035, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8625351190567017, "num_tokens": 193827721.0, "step": 5078 }, { "epoch": 0.6461010049612008, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.13203239440918, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.844673752784729, "num_tokens": 193867217.0, "step": 5079 }, { "epoch": 0.6462282152397913, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.221384048461914, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8616994023323059, "num_tokens": 193900128.0, "step": 5080 }, { "epoch": 0.6463554255183819, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.902685165405273, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8670561909675598, "num_tokens": 193935744.0, "step": 5081 }, { "epoch": 0.6464826357969724, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.83223533630371, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8514987826347351, "num_tokens": 193977344.0, "step": 5082 }, { "epoch": 0.6466098460755629, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.176227569580078, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8266412615776062, "num_tokens": 194014295.0, "step": 5083 }, { "epoch": 0.6467370563541535, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.70793914794922, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8380387425422668, "num_tokens": 194055245.0, "step": 5084 }, { "epoch": 0.6468642666327439, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.156055450439453, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8480892777442932, "num_tokens": 194089313.0, "step": 5085 }, { "epoch": 0.6469914769113344, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.287628173828125, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8624622225761414, "num_tokens": 194128293.0, "step": 5086 }, { "epoch": 0.6471186871899249, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.1036376953125, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8474103808403015, "num_tokens": 194166632.0, "step": 5087 }, { "epoch": 0.6472458974685155, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.307689666748047, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8406755328178406, "num_tokens": 194209176.0, "step": 5088 }, { "epoch": 0.647373107747106, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.07139778137207, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8445569276809692, "num_tokens": 194245617.0, "step": 5089 }, { "epoch": 0.6475003180256965, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.244539260864258, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8597817420959473, "num_tokens": 194285983.0, "step": 5090 }, { "epoch": 0.6476275283042869, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.046754837036133, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8498905897140503, "num_tokens": 194317088.0, "step": 5091 }, { "epoch": 0.6477547385828775, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.036788940429688, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8589912056922913, "num_tokens": 194354025.0, "step": 5092 }, { "epoch": 0.647881948861468, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.038232803344727, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8583147525787354, "num_tokens": 194388539.0, "step": 5093 }, { "epoch": 0.6480091591400585, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.45464324951172, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8543843030929565, "num_tokens": 194424518.0, "step": 5094 }, { "epoch": 0.648136369418649, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.838275909423828, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8505830764770508, "num_tokens": 194463310.0, "step": 5095 }, { "epoch": 0.6482635796972396, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.440235137939453, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8537123799324036, "num_tokens": 194499911.0, "step": 5096 }, { "epoch": 0.64839078997583, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 29.975242614746094, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8452799320220947, "num_tokens": 194536731.0, "step": 5097 }, { "epoch": 0.6485180002544205, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.441646575927734, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.843423068523407, "num_tokens": 194574014.0, "step": 5098 }, { "epoch": 0.648645210533011, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.120391845703125, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.8461773991584778, "num_tokens": 194611944.0, "step": 5099 }, { "epoch": 0.6487724208116016, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.31688690185547, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8644061088562012, "num_tokens": 194649734.0, "step": 5100 }, { "epoch": 0.6488996310901921, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.926334381103516, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.862532913684845, "num_tokens": 194688344.0, "step": 5101 }, { "epoch": 0.6490268413687826, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.604515075683594, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8405723571777344, "num_tokens": 194728582.0, "step": 5102 }, { "epoch": 0.649154051647373, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.001728057861328, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8553479313850403, "num_tokens": 194762753.0, "step": 5103 }, { "epoch": 0.6492812619259636, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.450326919555664, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8662001490592957, "num_tokens": 194802325.0, "step": 5104 }, { "epoch": 0.6494084722045541, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.63728141784668, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8532117605209351, "num_tokens": 194846349.0, "step": 5105 }, { "epoch": 0.6495356824831446, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.6007022857666, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.855200469493866, "num_tokens": 194880334.0, "step": 5106 }, { "epoch": 0.6496628927617352, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.905582427978516, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8456352949142456, "num_tokens": 194919343.0, "step": 5107 }, { "epoch": 0.6497901030403257, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.04062271118164, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8554564714431763, "num_tokens": 194958800.0, "step": 5108 }, { "epoch": 0.6499173133189162, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 30.30404281616211, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8277755975723267, "num_tokens": 194997249.0, "step": 5109 }, { "epoch": 0.6500445235975066, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.8650970458984375e-05, "grad_norm": 30.16745376586914, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8568242192268372, "num_tokens": 195035879.0, "step": 5110 }, { "epoch": 0.6501717338760972, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.67293930053711, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8498241901397705, "num_tokens": 195075925.0, "step": 5111 }, { "epoch": 0.6502989441546877, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.645246505737305, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8560864925384521, "num_tokens": 195110185.0, "step": 5112 }, { "epoch": 0.6504261544332782, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.55630874633789, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.847641110420227, "num_tokens": 195154666.0, "step": 5113 }, { "epoch": 0.6505533647118688, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 29.587871551513672, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8664634227752686, "num_tokens": 195188998.0, "step": 5114 }, { "epoch": 0.6506805749904593, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.079673767089844e-05, "grad_norm": 30.633312225341797, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8609333038330078, "num_tokens": 195224124.0, "step": 5115 }, { "epoch": 0.6508077852690497, "ewc_loss": 0.0732421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.841255187988281e-05, "grad_norm": 29.60906410217285, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8558363914489746, "num_tokens": 195253601.0, "step": 5116 }, { "epoch": 0.6509349955476402, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.557586669921875, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8456325531005859, "num_tokens": 195289045.0, "step": 5117 }, { "epoch": 0.6510622058262308, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.164453506469727, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8551598787307739, "num_tokens": 195326983.0, "step": 5118 }, { "epoch": 0.6511894161048213, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.322546005249023, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8648252487182617, "num_tokens": 195360425.0, "step": 5119 }, { "epoch": 0.6513166263834118, "ewc_loss": 0.07373046875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.888938903808594e-05, "grad_norm": 30.174997329711914, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8432058095932007, "num_tokens": 195402648.0, "step": 5120 }, { "epoch": 0.6514438366620023, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.215850830078125, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8714936971664429, "num_tokens": 195440945.0, "step": 5121 }, { "epoch": 0.6515710469405928, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.7370548248291, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8568142652511597, "num_tokens": 195481872.0, "step": 5122 }, { "epoch": 0.6516982572191833, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.357587814331055, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8603657484054565, "num_tokens": 195525460.0, "step": 5123 }, { "epoch": 0.6518254674977738, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 29.905540466308594, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8563984036445618, "num_tokens": 195563522.0, "step": 5124 }, { "epoch": 0.6519526777763643, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.1828556060791, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8642596006393433, "num_tokens": 195598114.0, "step": 5125 }, { "epoch": 0.6520798880549549, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.758596420288086, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8498919010162354, "num_tokens": 195639826.0, "step": 5126 }, { "epoch": 0.6522070983335454, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.08234214782715, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8619235157966614, "num_tokens": 195682047.0, "step": 5127 }, { "epoch": 0.6523343086121358, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.389970779418945, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8580026626586914, "num_tokens": 195722453.0, "step": 5128 }, { "epoch": 0.6524615188907263, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.087615966796875, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8501408100128174, "num_tokens": 195759182.0, "step": 5129 }, { "epoch": 0.6525887291693169, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.528865814208984, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8567259311676025, "num_tokens": 195795641.0, "step": 5130 }, { "epoch": 0.6527159394479074, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.936622619628906e-05, "grad_norm": 29.900705337524414, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8638746738433838, "num_tokens": 195831200.0, "step": 5131 }, { "epoch": 0.6528431497264979, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.163419723510742, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.857101559638977, "num_tokens": 195875110.0, "step": 5132 }, { "epoch": 0.6529703600050885, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.533735275268555, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8567638397216797, "num_tokens": 195916724.0, "step": 5133 }, { "epoch": 0.6530975702836789, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.802021026611328, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8490887880325317, "num_tokens": 195956219.0, "step": 5134 }, { "epoch": 0.6532247805622694, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.681659698486328, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8587477803230286, "num_tokens": 195991563.0, "step": 5135 }, { "epoch": 0.6533519908408599, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.11515235900879, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8614712953567505, "num_tokens": 196031864.0, "step": 5136 }, { "epoch": 0.6534792011194505, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.247596740722656, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8468016386032104, "num_tokens": 196067208.0, "step": 5137 }, { "epoch": 0.653606411398041, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.91278076171875e-05, "grad_norm": 30.28187370300293, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8585942983627319, "num_tokens": 196103934.0, "step": 5138 }, { "epoch": 0.6537336216766315, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.151437759399414, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8506770133972168, "num_tokens": 196140886.0, "step": 5139 }, { "epoch": 0.6538608319552219, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.40998077392578, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8637754917144775, "num_tokens": 196178940.0, "step": 5140 }, { "epoch": 0.6539880422338125, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.06810760498047, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8573789596557617, "num_tokens": 196229461.0, "step": 5141 }, { "epoch": 0.654115252512403, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.20306968688965, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8682985901832581, "num_tokens": 196265793.0, "step": 5142 }, { "epoch": 0.6542424627909935, "ewc_loss": 0.07421875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.9604644775390625e-05, "grad_norm": 30.088319778442383, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8615478277206421, "num_tokens": 196304919.0, "step": 5143 }, { "epoch": 0.654369673069584, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.269147872924805, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8693546652793884, "num_tokens": 196344761.0, "step": 5144 }, { "epoch": 0.6544968833481746, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.049837112426758, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.86915522813797, "num_tokens": 196382625.0, "step": 5145 }, { "epoch": 0.654624093626765, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.209144592285156, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8662201166152954, "num_tokens": 196422279.0, "step": 5146 }, { "epoch": 0.6547513039053555, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 30.145654678344727, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8510922193527222, "num_tokens": 196463684.0, "step": 5147 }, { "epoch": 0.654878514183946, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.254240036010742, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8596748113632202, "num_tokens": 196506319.0, "step": 5148 }, { "epoch": 0.6550057244625366, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.188875198364258, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8495166301727295, "num_tokens": 196546367.0, "step": 5149 }, { "epoch": 0.6551329347411271, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 6.079673767089844e-05, "grad_norm": 30.570192337036133, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8390395045280457, "num_tokens": 196576598.0, "step": 5150 }, { "epoch": 0.6552601450197176, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 6.079673767089844e-05, "grad_norm": 30.24138641357422, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8485814332962036, "num_tokens": 196610633.0, "step": 5151 }, { "epoch": 0.655387355298308, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4841556549072266e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.582841873168945, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8475976586341858, "num_tokens": 196646183.0, "step": 5152 }, { "epoch": 0.6555145655768986, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.436784744262695, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.864701509475708, "num_tokens": 196689454.0, "step": 5153 }, { "epoch": 0.6556417758554891, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.079673767089844e-05, "grad_norm": 30.134737014770508, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8401293754577637, "num_tokens": 196731820.0, "step": 5154 }, { "epoch": 0.6557689861340796, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.31475830078125, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.846274733543396, "num_tokens": 196768542.0, "step": 5155 }, { "epoch": 0.6558961964126702, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 30.308639526367188, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8395150899887085, "num_tokens": 196810300.0, "step": 5156 }, { "epoch": 0.6560234066912607, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.24333381652832, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.838929295539856, "num_tokens": 196847271.0, "step": 5157 }, { "epoch": 0.6561506169698512, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.590457916259766, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8707109689712524, "num_tokens": 196880439.0, "step": 5158 }, { "epoch": 0.6562778272484416, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.905488967895508, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8465592861175537, "num_tokens": 196914911.0, "step": 5159 }, { "epoch": 0.6564050375270322, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.556102752685547, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8552629947662354, "num_tokens": 196949419.0, "step": 5160 }, { "epoch": 0.6565322478056227, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.783565521240234, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.847769558429718, "num_tokens": 196989401.0, "step": 5161 }, { "epoch": 0.6566594580842132, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.741230010986328, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8461904525756836, "num_tokens": 197028377.0, "step": 5162 }, { "epoch": 0.6567866683628037, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.715665817260742, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8339493274688721, "num_tokens": 197068334.0, "step": 5163 }, { "epoch": 0.6569138786413943, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.517112731933594, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8570767641067505, "num_tokens": 197105978.0, "step": 5164 }, { "epoch": 0.6570410889199847, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.008148193359375e-05, "grad_norm": 29.74892807006836, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8368205428123474, "num_tokens": 197148641.0, "step": 5165 }, { "epoch": 0.6571682991985752, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.73420524597168, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8359513878822327, "num_tokens": 197188820.0, "step": 5166 }, { "epoch": 0.6572955094771658, "ewc_loss": 0.07470703125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 5.984306335449219e-05, "grad_norm": 29.965723037719727, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8425230979919434, "num_tokens": 197227109.0, "step": 5167 }, { "epoch": 0.6574227197557563, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 29.945865631103516, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8606950044631958, "num_tokens": 197264083.0, "step": 5168 }, { "epoch": 0.6575499300343468, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.375370025634766, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8504387736320496, "num_tokens": 197301349.0, "step": 5169 }, { "epoch": 0.6576771403129373, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.10419273376465, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8433375954627991, "num_tokens": 197348176.0, "step": 5170 }, { "epoch": 0.6578043505915278, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.079673767089844e-05, "grad_norm": 30.264768600463867, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8472517728805542, "num_tokens": 197380490.0, "step": 5171 }, { "epoch": 0.6579315608701183, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 6.079673767089844e-05, "grad_norm": 30.390905380249023, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8421217203140259, "num_tokens": 197423075.0, "step": 5172 }, { "epoch": 0.6580587711487088, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 29.88683319091797, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.857908308506012, "num_tokens": 197461730.0, "step": 5173 }, { "epoch": 0.6581859814272993, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.540767669677734, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8442007899284363, "num_tokens": 197505801.0, "step": 5174 }, { "epoch": 0.6583131917058899, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 29.774911880493164, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8711690902709961, "num_tokens": 197534643.0, "step": 5175 }, { "epoch": 0.6584404019844804, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.42146873474121, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.859380304813385, "num_tokens": 197576490.0, "step": 5176 }, { "epoch": 0.6585676122630708, "ewc_loss": 0.0751953125, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.031990051269531e-05, "grad_norm": 30.120222091674805, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8668849468231201, "num_tokens": 197612745.0, "step": 5177 }, { "epoch": 0.6586948225416613, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.095571517944336, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8498060703277588, "num_tokens": 197653756.0, "step": 5178 }, { "epoch": 0.6588220328202519, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 29.9395809173584, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8793864250183105, "num_tokens": 197694417.0, "step": 5179 }, { "epoch": 0.6589492430988424, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.238582611083984, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8497468829154968, "num_tokens": 197732738.0, "step": 5180 }, { "epoch": 0.6590764533774329, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.162841796875, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8441353440284729, "num_tokens": 197759408.0, "step": 5181 }, { "epoch": 0.6592036636560235, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.101425170898438, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8567402362823486, "num_tokens": 197794334.0, "step": 5182 }, { "epoch": 0.6593308739346139, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.218778610229492, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8634636402130127, "num_tokens": 197832497.0, "step": 5183 }, { "epoch": 0.6594580842132044, "ewc_loss": 0.07568359375, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.0558319091796875e-05, "grad_norm": 30.08097267150879, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.845712423324585, "num_tokens": 197869021.0, "step": 5184 }, { "epoch": 0.6595852944917949, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.637197494506836, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8405600190162659, "num_tokens": 197906365.0, "step": 5185 }, { "epoch": 0.6597125047703855, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.433921813964844, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8573616743087769, "num_tokens": 197950725.0, "step": 5186 }, { "epoch": 0.659839715048976, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 29.837846755981445, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8626153469085693, "num_tokens": 197991178.0, "step": 5187 }, { "epoch": 0.6599669253275665, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.266620635986328, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8447187542915344, "num_tokens": 198030176.0, "step": 5188 }, { "epoch": 0.6600941356061569, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 29.77562713623047, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8493511080741882, "num_tokens": 198067169.0, "step": 5189 }, { "epoch": 0.6602213458847475, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.114519119262695, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8443320393562317, "num_tokens": 198104994.0, "step": 5190 }, { "epoch": 0.660348556163338, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.393709182739258, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8591829538345337, "num_tokens": 198142791.0, "step": 5191 }, { "epoch": 0.6604757664419285, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.826406478881836, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8507223725318909, "num_tokens": 198179300.0, "step": 5192 }, { "epoch": 0.660602976720519, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.052309036254883, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8688633441925049, "num_tokens": 198220230.0, "step": 5193 }, { "epoch": 0.6607301869991096, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.016647338867188, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8423781991004944, "num_tokens": 198251067.0, "step": 5194 }, { "epoch": 0.6608573972777, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.90648651123047, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8628553152084351, "num_tokens": 198282130.0, "step": 5195 }, { "epoch": 0.6609846075562905, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.009628295898438, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8648303747177124, "num_tokens": 198325609.0, "step": 5196 }, { "epoch": 0.661111817834881, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.770490646362305, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8574169874191284, "num_tokens": 198365116.0, "step": 5197 }, { "epoch": 0.6612390281134716, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 29.896549224853516, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8549743294715881, "num_tokens": 198403467.0, "step": 5198 }, { "epoch": 0.6613662383920621, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.08222198486328, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8608300685882568, "num_tokens": 198440676.0, "step": 5199 }, { "epoch": 0.6614934486706526, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.078126907348633, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8570345044136047, "num_tokens": 198479082.0, "step": 5200 }, { "epoch": 0.661620658949243, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.911725997924805, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8508415818214417, "num_tokens": 198521741.0, "step": 5201 }, { "epoch": 0.6617478692278336, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.09395408630371, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8465355634689331, "num_tokens": 198562177.0, "step": 5202 }, { "epoch": 0.6618750795064241, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.000652313232422, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8549312353134155, "num_tokens": 198600726.0, "step": 5203 }, { "epoch": 0.6620022897850146, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.975269317626953, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8263719081878662, "num_tokens": 198638156.0, "step": 5204 }, { "epoch": 0.6621295000636052, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.15827751159668, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8353201150894165, "num_tokens": 198680312.0, "step": 5205 }, { "epoch": 0.6622567103421957, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.108699798583984, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.872466504573822, "num_tokens": 198723886.0, "step": 5206 }, { "epoch": 0.6623839206207861, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.07872200012207, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8545340299606323, "num_tokens": 198765331.0, "step": 5207 }, { "epoch": 0.6625111308993766, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.284841537475586, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8577756881713867, "num_tokens": 198802421.0, "step": 5208 }, { "epoch": 0.6626383411779672, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 29.8833065032959, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8647498488426208, "num_tokens": 198843335.0, "step": 5209 }, { "epoch": 0.6627655514565577, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.391090393066406, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8614672422409058, "num_tokens": 198880925.0, "step": 5210 }, { "epoch": 0.6628927617351482, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.0939884185791, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8532055020332336, "num_tokens": 198915448.0, "step": 5211 }, { "epoch": 0.6630199720137387, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.289533615112305, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8653774857521057, "num_tokens": 198952254.0, "step": 5212 }, { "epoch": 0.6631471822923293, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.518932342529297, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8592756986618042, "num_tokens": 198990841.0, "step": 5213 }, { "epoch": 0.6632743925709197, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.017826080322266, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8577619791030884, "num_tokens": 199029383.0, "step": 5214 }, { "epoch": 0.6634016028495102, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.569690704345703, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.855344295501709, "num_tokens": 199066731.0, "step": 5215 }, { "epoch": 0.6635288131281007, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.007823944091797, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8670987486839294, "num_tokens": 199108752.0, "step": 5216 }, { "epoch": 0.6636560234066913, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.187023162841797, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8582690954208374, "num_tokens": 199146438.0, "step": 5217 }, { "epoch": 0.6637832336852818, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.334674835205078, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8564056158065796, "num_tokens": 199186367.0, "step": 5218 }, { "epoch": 0.6639104439638723, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.352306365966797, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8684507608413696, "num_tokens": 199225785.0, "step": 5219 }, { "epoch": 0.6640376542424627, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.57109260559082, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8583815693855286, "num_tokens": 199262574.0, "step": 5220 }, { "epoch": 0.6641648645210533, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.574127197265625, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8599129319190979, "num_tokens": 199299093.0, "step": 5221 }, { "epoch": 0.6642920747996438, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.29292106628418, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8613871335983276, "num_tokens": 199337446.0, "step": 5222 }, { "epoch": 0.6644192850782343, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.66851234436035, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8570661544799805, "num_tokens": 199377321.0, "step": 5223 }, { "epoch": 0.6645464953568249, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.019229888916016, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8660513162612915, "num_tokens": 199413057.0, "step": 5224 }, { "epoch": 0.6646737056354154, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.77983856201172, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8611793518066406, "num_tokens": 199452730.0, "step": 5225 }, { "epoch": 0.6648009159140058, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.05830955505371, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8659051656723022, "num_tokens": 199487860.0, "step": 5226 }, { "epoch": 0.6649281261925963, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.643220901489258, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8630092144012451, "num_tokens": 199532790.0, "step": 5227 }, { "epoch": 0.6650553364711869, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.207866668701172, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8752282857894897, "num_tokens": 199561072.0, "step": 5228 }, { "epoch": 0.6651825467497774, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.380277633666992, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8619092702865601, "num_tokens": 199595820.0, "step": 5229 }, { "epoch": 0.6653097570283679, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.508142471313477, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8427178859710693, "num_tokens": 199640101.0, "step": 5230 }, { "epoch": 0.6654369673069584, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.370153427124023, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8515284061431885, "num_tokens": 199680287.0, "step": 5231 }, { "epoch": 0.6655641775855489, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.277647018432617, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.854935884475708, "num_tokens": 199721958.0, "step": 5232 }, { "epoch": 0.6656913878641394, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.53485107421875, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8612498641014099, "num_tokens": 199756021.0, "step": 5233 }, { "epoch": 0.6658185981427299, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.968055725097656, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8594120144844055, "num_tokens": 199793586.0, "step": 5234 }, { "epoch": 0.6659458084213205, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.937999725341797, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.842917799949646, "num_tokens": 199835728.0, "step": 5235 }, { "epoch": 0.666073018699911, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.933151245117188, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.864372730255127, "num_tokens": 199873348.0, "step": 5236 }, { "epoch": 0.6662002289785015, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.522314071655273, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8565208911895752, "num_tokens": 199908223.0, "step": 5237 }, { "epoch": 0.6663274392570919, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.134668350219727, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8518537282943726, "num_tokens": 199947017.0, "step": 5238 }, { "epoch": 0.6664546495356825, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.180038452148438, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8641220331192017, "num_tokens": 199984806.0, "step": 5239 }, { "epoch": 0.666581859814273, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.206161499023438, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8493177890777588, "num_tokens": 200022600.0, "step": 5240 }, { "epoch": 0.6667090700928635, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.60087013244629, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8661553263664246, "num_tokens": 200058407.0, "step": 5241 }, { "epoch": 0.666836280371454, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.90032196044922, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8285310864448547, "num_tokens": 200101871.0, "step": 5242 }, { "epoch": 0.6669634906500446, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.702224731445312, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.856410026550293, "num_tokens": 200138501.0, "step": 5243 }, { "epoch": 0.667090700928635, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.06114959716797, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8554565906524658, "num_tokens": 200176639.0, "step": 5244 }, { "epoch": 0.6672179112072255, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.518062591552734, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8505215048789978, "num_tokens": 200211873.0, "step": 5245 }, { "epoch": 0.667345121485816, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.05010414123535, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8510043025016785, "num_tokens": 200246596.0, "step": 5246 }, { "epoch": 0.6674723317644066, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.602760314941406, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8721009492874146, "num_tokens": 200282761.0, "step": 5247 }, { "epoch": 0.6675995420429971, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.20606803894043, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8399980068206787, "num_tokens": 200321821.0, "step": 5248 }, { "epoch": 0.6677267523215876, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.35723304748535, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8393129110336304, "num_tokens": 200369066.0, "step": 5249 }, { "epoch": 0.667853962600178, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.349773406982422, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8480501174926758, "num_tokens": 200408067.0, "step": 5250 }, { "epoch": 0.6679811728787686, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.6833553314209, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8731774091720581, "num_tokens": 200446693.0, "step": 5251 }, { "epoch": 0.6681083831573591, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.25042724609375, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.83609938621521, "num_tokens": 200485340.0, "step": 5252 }, { "epoch": 0.6682355934359496, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.77907943725586, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8528199195861816, "num_tokens": 200521978.0, "step": 5253 }, { "epoch": 0.6683628037145402, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.327497482299805, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8563283681869507, "num_tokens": 200560604.0, "step": 5254 }, { "epoch": 0.6684900139931307, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.522428512573242, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8517505526542664, "num_tokens": 200601718.0, "step": 5255 }, { "epoch": 0.6686172242717211, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.473636627197266, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8318756222724915, "num_tokens": 200646186.0, "step": 5256 }, { "epoch": 0.6687444345503116, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.383453369140625, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.858123779296875, "num_tokens": 200684849.0, "step": 5257 }, { "epoch": 0.6688716448289022, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.294452667236328, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8605435490608215, "num_tokens": 200719542.0, "step": 5258 }, { "epoch": 0.6689988551074927, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.44713592529297, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8574815988540649, "num_tokens": 200756212.0, "step": 5259 }, { "epoch": 0.6691260653860832, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.107933044433594, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8578476905822754, "num_tokens": 200791184.0, "step": 5260 }, { "epoch": 0.6692532756646737, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.305105209350586, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8382416367530823, "num_tokens": 200823966.0, "step": 5261 }, { "epoch": 0.6693804859432643, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.43451690673828, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8602092266082764, "num_tokens": 200862718.0, "step": 5262 }, { "epoch": 0.6695076962218547, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.3445987701416, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8500970005989075, "num_tokens": 200901755.0, "step": 5263 }, { "epoch": 0.6696349065004452, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.495418548583984, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8682620525360107, "num_tokens": 200935788.0, "step": 5264 }, { "epoch": 0.6697621167790357, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.94257354736328, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8702070116996765, "num_tokens": 200975159.0, "step": 5265 }, { "epoch": 0.6698893270576263, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.93522834777832, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.86739182472229, "num_tokens": 201010813.0, "step": 5266 }, { "epoch": 0.6700165373362168, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.96695899963379, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8533979058265686, "num_tokens": 201051937.0, "step": 5267 }, { "epoch": 0.6701437476148073, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.63581085205078, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8622052669525146, "num_tokens": 201090926.0, "step": 5268 }, { "epoch": 0.6702709578933977, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.03704833984375, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8352433443069458, "num_tokens": 201126086.0, "step": 5269 }, { "epoch": 0.6703981681719883, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.407941818237305, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.84498131275177, "num_tokens": 201165933.0, "step": 5270 }, { "epoch": 0.6705253784505788, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.276500701904297, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8588447570800781, "num_tokens": 201198794.0, "step": 5271 }, { "epoch": 0.6706525887291693, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.225765228271484, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8464664816856384, "num_tokens": 201234081.0, "step": 5272 }, { "epoch": 0.6707797990077599, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.846036911010742, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8518736362457275, "num_tokens": 201271796.0, "step": 5273 }, { "epoch": 0.6709070092863504, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.223270416259766, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8373827934265137, "num_tokens": 201309526.0, "step": 5274 }, { "epoch": 0.6710342195649408, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.753944396972656, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8616722822189331, "num_tokens": 201344015.0, "step": 5275 }, { "epoch": 0.6711614298435313, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.234607696533203, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8559174537658691, "num_tokens": 201385201.0, "step": 5276 }, { "epoch": 0.6712886401221219, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.638235092163086, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8610145449638367, "num_tokens": 201422370.0, "step": 5277 }, { "epoch": 0.6714158504007124, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.212379455566406, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8406676054000854, "num_tokens": 201467085.0, "step": 5278 }, { "epoch": 0.6715430606793029, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.487051010131836, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8696689009666443, "num_tokens": 201503432.0, "step": 5279 }, { "epoch": 0.6716702709578934, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.103073120117188, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8619812726974487, "num_tokens": 201541358.0, "step": 5280 }, { "epoch": 0.6717974812364839, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.738174438476562, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8522251844406128, "num_tokens": 201580235.0, "step": 5281 }, { "epoch": 0.6719246915150744, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.266080856323242, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8562501668930054, "num_tokens": 201627703.0, "step": 5282 }, { "epoch": 0.6720519017936649, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.842531204223633, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8564419746398926, "num_tokens": 201667182.0, "step": 5283 }, { "epoch": 0.6721791120722554, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.298913955688477, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8571218848228455, "num_tokens": 201699876.0, "step": 5284 }, { "epoch": 0.672306322350846, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.397258758544922, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8439123630523682, "num_tokens": 201732770.0, "step": 5285 }, { "epoch": 0.6724335326294365, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.204532623291016, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.85793137550354, "num_tokens": 201771388.0, "step": 5286 }, { "epoch": 0.6725607429080269, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.310091018676758, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8499525785446167, "num_tokens": 201809283.0, "step": 5287 }, { "epoch": 0.6726879531866174, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.33883285522461, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8620716333389282, "num_tokens": 201854950.0, "step": 5288 }, { "epoch": 0.672815163465208, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.363840103149414, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.859237790107727, "num_tokens": 201895638.0, "step": 5289 }, { "epoch": 0.6729423737437985, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.389799118041992, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8414450287818909, "num_tokens": 201928243.0, "step": 5290 }, { "epoch": 0.673069584022389, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.087175369262695, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8546637296676636, "num_tokens": 201964687.0, "step": 5291 }, { "epoch": 0.6731967943009796, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.413936614990234, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8683235049247742, "num_tokens": 201997166.0, "step": 5292 }, { "epoch": 0.67332400457957, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.14337158203125, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8577943444252014, "num_tokens": 202030771.0, "step": 5293 }, { "epoch": 0.6734512148581605, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.313488006591797, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8485137224197388, "num_tokens": 202077280.0, "step": 5294 }, { "epoch": 0.673578425136751, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.058704376220703, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8474966287612915, "num_tokens": 202115273.0, "step": 5295 }, { "epoch": 0.6737056354153416, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.347503662109375, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8658701777458191, "num_tokens": 202145347.0, "step": 5296 }, { "epoch": 0.6738328456939321, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.16729164123535, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8605443835258484, "num_tokens": 202184621.0, "step": 5297 }, { "epoch": 0.6739600559725226, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.330896377563477, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8589581847190857, "num_tokens": 202230099.0, "step": 5298 }, { "epoch": 0.674087266251113, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.98832893371582, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8483014106750488, "num_tokens": 202267872.0, "step": 5299 }, { "epoch": 0.6742144765297036, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5020370483398438e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.5482120513916, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8536513447761536, "num_tokens": 202305517.0, "step": 5300 }, { "epoch": 0.6743416868082941, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.40594482421875, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8406514525413513, "num_tokens": 202340671.0, "step": 5301 }, { "epoch": 0.6744688970868846, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.22621726989746, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8631723523139954, "num_tokens": 202379552.0, "step": 5302 }, { "epoch": 0.6745961073654752, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.184999465942383, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8493978977203369, "num_tokens": 202421404.0, "step": 5303 }, { "epoch": 0.6747233176440657, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.290767669677734, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8411375284194946, "num_tokens": 202463765.0, "step": 5304 }, { "epoch": 0.6748505279226561, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.189407348632812, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8508504629135132, "num_tokens": 202505872.0, "step": 5305 }, { "epoch": 0.6749777382012466, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.169429779052734, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8528673052787781, "num_tokens": 202544085.0, "step": 5306 }, { "epoch": 0.6751049484798372, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.440088272094727, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8519877195358276, "num_tokens": 202588237.0, "step": 5307 }, { "epoch": 0.6752321587584277, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.188228607177734, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.849622368812561, "num_tokens": 202628462.0, "step": 5308 }, { "epoch": 0.6753593690370182, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.552200317382812, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8583345413208008, "num_tokens": 202668149.0, "step": 5309 }, { "epoch": 0.6754865793156087, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 29.98490333557129, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8705716133117676, "num_tokens": 202708651.0, "step": 5310 }, { "epoch": 0.6756137895941993, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.642616271972656, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8611671924591064, "num_tokens": 202738613.0, "step": 5311 }, { "epoch": 0.6757409998727897, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.268596649169922, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.851359486579895, "num_tokens": 202779279.0, "step": 5312 }, { "epoch": 0.6758682101513802, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.69951057434082, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8555504083633423, "num_tokens": 202812095.0, "step": 5313 }, { "epoch": 0.6759954204299707, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.0141544342041, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8512359857559204, "num_tokens": 202855705.0, "step": 5314 }, { "epoch": 0.6761226307085613, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.372922897338867, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8380404710769653, "num_tokens": 202894180.0, "step": 5315 }, { "epoch": 0.6762498409871518, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.3104190826416, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8620541095733643, "num_tokens": 202925842.0, "step": 5316 }, { "epoch": 0.6763770512657423, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.38960838317871, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8615649938583374, "num_tokens": 202959103.0, "step": 5317 }, { "epoch": 0.6765042615443327, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.32866668701172, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8548474311828613, "num_tokens": 203000393.0, "step": 5318 }, { "epoch": 0.6766314718229233, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.32875633239746, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8569153547286987, "num_tokens": 203041613.0, "step": 5319 }, { "epoch": 0.6767586821015138, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.468048095703125, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8710520267486572, "num_tokens": 203073744.0, "step": 5320 }, { "epoch": 0.6768858923801043, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.480501174926758, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8397635817527771, "num_tokens": 203108350.0, "step": 5321 }, { "epoch": 0.6770131026586949, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.598838806152344, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8588154315948486, "num_tokens": 203139149.0, "step": 5322 }, { "epoch": 0.6771403129372854, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.429227828979492, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8361057043075562, "num_tokens": 203173953.0, "step": 5323 }, { "epoch": 0.6772675232158758, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.32468605041504, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8646538257598877, "num_tokens": 203208915.0, "step": 5324 }, { "epoch": 0.6773947334944663, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.360370635986328, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8431134223937988, "num_tokens": 203251906.0, "step": 5325 }, { "epoch": 0.6775219437730569, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.510772705078125, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8574048280715942, "num_tokens": 203289460.0, "step": 5326 }, { "epoch": 0.6776491540516474, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.21282386779785, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8637518882751465, "num_tokens": 203325647.0, "step": 5327 }, { "epoch": 0.6777763643302379, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.545536041259766, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8507472276687622, "num_tokens": 203364972.0, "step": 5328 }, { "epoch": 0.6779035746088284, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.284147262573242, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8558669090270996, "num_tokens": 203407778.0, "step": 5329 }, { "epoch": 0.6780307848874189, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.51611328125, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8567800521850586, "num_tokens": 203443820.0, "step": 5330 }, { "epoch": 0.6781579951660094, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.431106567382812, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8604409694671631, "num_tokens": 203475742.0, "step": 5331 }, { "epoch": 0.6782852054445999, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.605012893676758, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8308287858963013, "num_tokens": 203512617.0, "step": 5332 }, { "epoch": 0.6784124157231904, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.508787155151367, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8393386006355286, "num_tokens": 203546778.0, "step": 5333 }, { "epoch": 0.678539626001781, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.920413970947266, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8564671277999878, "num_tokens": 203585222.0, "step": 5334 }, { "epoch": 0.6786668362803715, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.564697265625, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8487082719802856, "num_tokens": 203627014.0, "step": 5335 }, { "epoch": 0.6787940465589619, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.19274139404297, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8612707853317261, "num_tokens": 203659189.0, "step": 5336 }, { "epoch": 0.6789212568375524, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.877105712890625, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8660584688186646, "num_tokens": 203695355.0, "step": 5337 }, { "epoch": 0.679048467116143, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.290164947509766, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.86412513256073, "num_tokens": 203733446.0, "step": 5338 }, { "epoch": 0.6791756773947335, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.590410232543945, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8395309448242188, "num_tokens": 203775387.0, "step": 5339 }, { "epoch": 0.679302887673324, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 31.01416015625, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8516412973403931, "num_tokens": 203810713.0, "step": 5340 }, { "epoch": 0.6794300979519146, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.363765716552734, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.867878258228302, "num_tokens": 203847144.0, "step": 5341 }, { "epoch": 0.679557308230505, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.738229751586914, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8613535761833191, "num_tokens": 203889716.0, "step": 5342 }, { "epoch": 0.6796845185090955, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.35077476501465, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8411474227905273, "num_tokens": 203927692.0, "step": 5343 }, { "epoch": 0.679811728787686, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.863554000854492, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8420761823654175, "num_tokens": 203960180.0, "step": 5344 }, { "epoch": 0.6799389390662766, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.196657180786133, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.866737961769104, "num_tokens": 203997375.0, "step": 5345 }, { "epoch": 0.6800661493448671, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 31.049556732177734, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8514187335968018, "num_tokens": 204037588.0, "step": 5346 }, { "epoch": 0.6801933596234576, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.336894989013672, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8475744128227234, "num_tokens": 204080993.0, "step": 5347 }, { "epoch": 0.680320569902048, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.7425594329834, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8570338487625122, "num_tokens": 204116863.0, "step": 5348 }, { "epoch": 0.6804477801806386, "ewc_loss": 0.076171875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.103515625e-05, "grad_norm": 30.249736785888672, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8480837345123291, "num_tokens": 204156551.0, "step": 5349 }, { "epoch": 0.6805749904592291, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.830642700195312, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.848813533782959, "num_tokens": 204196210.0, "step": 5350 }, { "epoch": 0.6807022007378196, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 29.966638565063477, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8534461259841919, "num_tokens": 204234791.0, "step": 5351 }, { "epoch": 0.6808294110164101, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.695789337158203, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8572021722793579, "num_tokens": 204271631.0, "step": 5352 }, { "epoch": 0.6809566212950007, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.39498519897461, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8658985495567322, "num_tokens": 204311879.0, "step": 5353 }, { "epoch": 0.6810838315735911, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.569259643554688, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8581064343452454, "num_tokens": 204357123.0, "step": 5354 }, { "epoch": 0.6812110418521816, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.76852798461914, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8536084294319153, "num_tokens": 204397196.0, "step": 5355 }, { "epoch": 0.6813382521307721, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.709625244140625, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8446378707885742, "num_tokens": 204434654.0, "step": 5356 }, { "epoch": 0.6814654624093627, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.6922664642334, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8774024248123169, "num_tokens": 204466581.0, "step": 5357 }, { "epoch": 0.6815926726879532, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.816396713256836, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8671473264694214, "num_tokens": 204503890.0, "step": 5358 }, { "epoch": 0.6817198829665437, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.602766036987305, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8512018322944641, "num_tokens": 204537160.0, "step": 5359 }, { "epoch": 0.6818470932451343, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.655797958374023, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8648374080657959, "num_tokens": 204570077.0, "step": 5360 }, { "epoch": 0.6819743035237247, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.22145652770996, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8713721036911011, "num_tokens": 204603638.0, "step": 5361 }, { "epoch": 0.6821015138023152, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.723575592041016, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.8517965078353882, "num_tokens": 204644584.0, "step": 5362 }, { "epoch": 0.6822287240809057, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.3914737701416, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8420987129211426, "num_tokens": 204678459.0, "step": 5363 }, { "epoch": 0.6823559343594963, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.395122528076172, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8512543439865112, "num_tokens": 204712923.0, "step": 5364 }, { "epoch": 0.6824831446380868, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.57255744934082, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8381600379943848, "num_tokens": 204751303.0, "step": 5365 }, { "epoch": 0.6826103549166773, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.46044921875, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8563641309738159, "num_tokens": 204791685.0, "step": 5366 }, { "epoch": 0.6827375651952677, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.474079132080078, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8548988103866577, "num_tokens": 204827748.0, "step": 5367 }, { "epoch": 0.6828647754738583, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.30283546447754, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8717061281204224, "num_tokens": 204866568.0, "step": 5368 }, { "epoch": 0.6829919857524488, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.8585147857666, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8549038171768188, "num_tokens": 204908334.0, "step": 5369 }, { "epoch": 0.6831191960310393, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.523191452026367, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8590410351753235, "num_tokens": 204949929.0, "step": 5370 }, { "epoch": 0.6832464063096299, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.530630111694336, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8452385663986206, "num_tokens": 204991435.0, "step": 5371 }, { "epoch": 0.6833736165882204, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.20958137512207, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8610373139381409, "num_tokens": 205028434.0, "step": 5372 }, { "epoch": 0.6835008268668108, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.80398941040039, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.849694013595581, "num_tokens": 205064138.0, "step": 5373 }, { "epoch": 0.6836280371454013, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.264141082763672, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8475013375282288, "num_tokens": 205104058.0, "step": 5374 }, { "epoch": 0.6837552474239919, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.401105880737305, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8540439605712891, "num_tokens": 205142609.0, "step": 5375 }, { "epoch": 0.6838824577025824, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.251684188842773, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.866044819355011, "num_tokens": 205180407.0, "step": 5376 }, { "epoch": 0.6840096679811729, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.792856216430664, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8759998083114624, "num_tokens": 205217340.0, "step": 5377 }, { "epoch": 0.6841368782597634, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.39763069152832, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8437517285346985, "num_tokens": 205263671.0, "step": 5378 }, { "epoch": 0.6842640885383539, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.108606338500977, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8513731956481934, "num_tokens": 205302970.0, "step": 5379 }, { "epoch": 0.6843912988169444, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.56976890563965, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8583273887634277, "num_tokens": 205342247.0, "step": 5380 }, { "epoch": 0.6845185090955349, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.832122802734375, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8410027623176575, "num_tokens": 205374897.0, "step": 5381 }, { "epoch": 0.6846457193741254, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.735137939453125, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8524700403213501, "num_tokens": 205411645.0, "step": 5382 }, { "epoch": 0.684772929652716, "ewc_loss": 0.07666015625, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.41432762145996, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8542928695678711, "num_tokens": 205450972.0, "step": 5383 }, { "epoch": 0.6849001399313065, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.36971664428711, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8569055795669556, "num_tokens": 205494314.0, "step": 5384 }, { "epoch": 0.6850273502098969, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.655702590942383, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.842903733253479, "num_tokens": 205534696.0, "step": 5385 }, { "epoch": 0.6851545604884874, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.456218719482422, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8677835464477539, "num_tokens": 205575579.0, "step": 5386 }, { "epoch": 0.685281770767078, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.198883056640625e-05, "grad_norm": 30.80141830444336, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8582488298416138, "num_tokens": 205609249.0, "step": 5387 }, { "epoch": 0.6854089810456685, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.218795776367188, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8526327610015869, "num_tokens": 205644135.0, "step": 5388 }, { "epoch": 0.685536191324259, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.922391891479492, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8488335609436035, "num_tokens": 205685312.0, "step": 5389 }, { "epoch": 0.6856634016028496, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.252267837524414, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8488385677337646, "num_tokens": 205720525.0, "step": 5390 }, { "epoch": 0.68579061188144, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.531082153320312, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8361680507659912, "num_tokens": 205763358.0, "step": 5391 }, { "epoch": 0.6859178221600305, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.447940826416016, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8584659099578857, "num_tokens": 205801413.0, "step": 5392 }, { "epoch": 0.686045032438621, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.833660125732422, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8567277789115906, "num_tokens": 205840442.0, "step": 5393 }, { "epoch": 0.6861722427172116, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.30525016784668, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8680744171142578, "num_tokens": 205877319.0, "step": 5394 }, { "epoch": 0.6862994529958021, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.841854095458984, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8524720072746277, "num_tokens": 205916186.0, "step": 5395 }, { "epoch": 0.6864266632743926, "ewc_loss": 0.078125, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.432254791259766, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.843646764755249, "num_tokens": 205956336.0, "step": 5396 }, { "epoch": 0.686553873552983, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.847143173217773, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8519190549850464, "num_tokens": 205989570.0, "step": 5397 }, { "epoch": 0.6866810838315736, "ewc_loss": 0.07763671875, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.42381477355957, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8468987941741943, "num_tokens": 206026139.0, "step": 5398 }, { "epoch": 0.6868082941101641, "ewc_loss": 0.078125, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.272079467773438, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8489128351211548, "num_tokens": 206070059.0, "step": 5399 }, { "epoch": 0.6869355043887546, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.48644256591797, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8589597940444946, "num_tokens": 206104645.0, "step": 5400 }, { "epoch": 0.6870627146673451, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.485504150390625, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8527979850769043, "num_tokens": 206141220.0, "step": 5401 }, { "epoch": 0.6871899249459357, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.753551483154297, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8529067039489746, "num_tokens": 206177917.0, "step": 5402 }, { "epoch": 0.6873171352245261, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.11784553527832, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8541985154151917, "num_tokens": 206213419.0, "step": 5403 }, { "epoch": 0.6874443455031166, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.75779914855957, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8386145830154419, "num_tokens": 206252421.0, "step": 5404 }, { "epoch": 0.6875715557817071, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.36382484436035, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8604269623756409, "num_tokens": 206287195.0, "step": 5405 }, { "epoch": 0.6876987660602977, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.459060668945312, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8363438248634338, "num_tokens": 206328339.0, "step": 5406 }, { "epoch": 0.6878259763388882, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.824522018432617, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8389116525650024, "num_tokens": 206369523.0, "step": 5407 }, { "epoch": 0.6879531866174787, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.129396438598633, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8743381500244141, "num_tokens": 206402864.0, "step": 5408 }, { "epoch": 0.6880803968960693, "ewc_loss": 0.07861328125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 31.13836669921875, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8430144190788269, "num_tokens": 206439004.0, "step": 5409 }, { "epoch": 0.6882076071746597, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.101411819458008, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8561109304428101, "num_tokens": 206472320.0, "step": 5410 }, { "epoch": 0.6883348174532502, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.956117630004883, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8469204902648926, "num_tokens": 206518649.0, "step": 5411 }, { "epoch": 0.6884620277318407, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.247549057006836, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8701490163803101, "num_tokens": 206551318.0, "step": 5412 }, { "epoch": 0.6885892380104313, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.760772705078125, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8423854112625122, "num_tokens": 206588173.0, "step": 5413 }, { "epoch": 0.6887164482890218, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.47722625732422, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8538838624954224, "num_tokens": 206624013.0, "step": 5414 }, { "epoch": 0.6888436585676123, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.502756118774414, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8607416749000549, "num_tokens": 206668180.0, "step": 5415 }, { "epoch": 0.6889708688462027, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.62380027770996, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8491195440292358, "num_tokens": 206710619.0, "step": 5416 }, { "epoch": 0.6890980791247933, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.430191040039062, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8592464923858643, "num_tokens": 206750121.0, "step": 5417 }, { "epoch": 0.6892252894033838, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.803354263305664, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8662586808204651, "num_tokens": 206789538.0, "step": 5418 }, { "epoch": 0.6893524996819743, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.64752960205078, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8544761538505554, "num_tokens": 206827513.0, "step": 5419 }, { "epoch": 0.6894797099605648, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.438161849975586, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.861108660697937, "num_tokens": 206866563.0, "step": 5420 }, { "epoch": 0.6896069202391554, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.429515838623047, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8650896549224854, "num_tokens": 206906281.0, "step": 5421 }, { "epoch": 0.6897341305177458, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.607208251953125, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8364623785018921, "num_tokens": 206944055.0, "step": 5422 }, { "epoch": 0.6898613407963363, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.594568252563477, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.854206919670105, "num_tokens": 206984421.0, "step": 5423 }, { "epoch": 0.6899885510749268, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.737951278686523, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8442907929420471, "num_tokens": 207020319.0, "step": 5424 }, { "epoch": 0.6901157613535174, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.64299774169922, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8570483326911926, "num_tokens": 207061487.0, "step": 5425 }, { "epoch": 0.6902429716321079, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.215688705444336, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.856419563293457, "num_tokens": 207098343.0, "step": 5426 }, { "epoch": 0.6903701819106984, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.787220001220703, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8518944382667542, "num_tokens": 207140381.0, "step": 5427 }, { "epoch": 0.6904973921892888, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.593610763549805, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8671369552612305, "num_tokens": 207173923.0, "step": 5428 }, { "epoch": 0.6906246024678794, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.646167755126953, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.861208438873291, "num_tokens": 207214840.0, "step": 5429 }, { "epoch": 0.6907518127464699, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.74839973449707, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8526458144187927, "num_tokens": 207247850.0, "step": 5430 }, { "epoch": 0.6908790230250604, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.584400177001953, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8554630279541016, "num_tokens": 207286698.0, "step": 5431 }, { "epoch": 0.691006233303651, "ewc_loss": 0.078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.61803436279297, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8496769666671753, "num_tokens": 207322470.0, "step": 5432 }, { "epoch": 0.6911334435822415, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.627981185913086, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8411574959754944, "num_tokens": 207361517.0, "step": 5433 }, { "epoch": 0.6912606538608319, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.54121971130371, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8646684885025024, "num_tokens": 207394504.0, "step": 5434 }, { "epoch": 0.6913878641394224, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.85819435119629, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8364665508270264, "num_tokens": 207430148.0, "step": 5435 }, { "epoch": 0.691515074418013, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.325700759887695, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8516316413879395, "num_tokens": 207461094.0, "step": 5436 }, { "epoch": 0.6916422846966035, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.9125919342041, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.861444354057312, "num_tokens": 207500572.0, "step": 5437 }, { "epoch": 0.691769494975194, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.126754760742188, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8481765985488892, "num_tokens": 207534977.0, "step": 5438 }, { "epoch": 0.6918967052537845, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.896160125732422, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8601852059364319, "num_tokens": 207569809.0, "step": 5439 }, { "epoch": 0.692023915532375, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.337772369384766, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8564126491546631, "num_tokens": 207606785.0, "step": 5440 }, { "epoch": 0.6921511258109655, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.73118019104004, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8533414006233215, "num_tokens": 207648344.0, "step": 5441 }, { "epoch": 0.692278336089556, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.68954086303711, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8621158599853516, "num_tokens": 207680962.0, "step": 5442 }, { "epoch": 0.6924055463681466, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.553415298461914, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8864766955375671, "num_tokens": 207709162.0, "step": 5443 }, { "epoch": 0.6925327566467371, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.94697380065918, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8515545129776001, "num_tokens": 207746089.0, "step": 5444 }, { "epoch": 0.6926599669253276, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.34711456298828, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8664746284484863, "num_tokens": 207791235.0, "step": 5445 }, { "epoch": 0.692787177203918, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.80420684814453, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.847895622253418, "num_tokens": 207824527.0, "step": 5446 }, { "epoch": 0.6929143874825086, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.53123664855957, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8660910129547119, "num_tokens": 207863535.0, "step": 5447 }, { "epoch": 0.6930415977610991, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.491806030273438, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8499962687492371, "num_tokens": 207899694.0, "step": 5448 }, { "epoch": 0.6931688080396896, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.00267219543457, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8460630178451538, "num_tokens": 207931440.0, "step": 5449 }, { "epoch": 0.6932960183182801, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.3779239654541, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8527052402496338, "num_tokens": 207963758.0, "step": 5450 }, { "epoch": 0.6934232285968707, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.65863800048828, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8603231906890869, "num_tokens": 208002383.0, "step": 5451 }, { "epoch": 0.6935504388754611, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.246566772460938e-05, "grad_norm": 30.052886962890625, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8590394258499146, "num_tokens": 208038723.0, "step": 5452 }, { "epoch": 0.6936776491540516, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.194320678710938, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8595476150512695, "num_tokens": 208071851.0, "step": 5453 }, { "epoch": 0.6938048594326421, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.35344123840332, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8377867937088013, "num_tokens": 208106916.0, "step": 5454 }, { "epoch": 0.6939320697112327, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.648441314697266, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8640141487121582, "num_tokens": 208146883.0, "step": 5455 }, { "epoch": 0.6940592799898232, "ewc_loss": 0.0771484375, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.151199340820312e-05, "grad_norm": 30.514209747314453, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8578997254371643, "num_tokens": 208187364.0, "step": 5456 }, { "epoch": 0.6941864902684137, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.199462890625, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8624162077903748, "num_tokens": 208226087.0, "step": 5457 }, { "epoch": 0.6943137005470043, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.75678825378418, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8516926765441895, "num_tokens": 208265193.0, "step": 5458 }, { "epoch": 0.6944409108255947, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.172691345214844, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8683924674987793, "num_tokens": 208305163.0, "step": 5459 }, { "epoch": 0.6945681211041852, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.483211517333984, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8442292213439941, "num_tokens": 208342229.0, "step": 5460 }, { "epoch": 0.6946953313827757, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.173973083496094, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8448747992515564, "num_tokens": 208384606.0, "step": 5461 }, { "epoch": 0.6948225416613663, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.864770889282227, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8736870884895325, "num_tokens": 208416777.0, "step": 5462 }, { "epoch": 0.6949497519399568, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.02396011352539, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.841274619102478, "num_tokens": 208456101.0, "step": 5463 }, { "epoch": 0.6950769622185473, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.350473403930664, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8647502660751343, "num_tokens": 208489201.0, "step": 5464 }, { "epoch": 0.6952041724971377, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.237104415893555, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8693703413009644, "num_tokens": 208525006.0, "step": 5465 }, { "epoch": 0.6953313827757283, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.643800735473633, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8528869152069092, "num_tokens": 208568997.0, "step": 5466 }, { "epoch": 0.6954585930543188, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.701099395751953, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8465876579284668, "num_tokens": 208605925.0, "step": 5467 }, { "epoch": 0.6955858033329093, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.586896896362305, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8312371969223022, "num_tokens": 208646290.0, "step": 5468 }, { "epoch": 0.6957130136114998, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.986814498901367, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8561810255050659, "num_tokens": 208686424.0, "step": 5469 }, { "epoch": 0.6958402238900904, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.743000030517578, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8569067716598511, "num_tokens": 208719643.0, "step": 5470 }, { "epoch": 0.6959674341686808, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.766000747680664, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8390262126922607, "num_tokens": 208760804.0, "step": 5471 }, { "epoch": 0.6960946444472713, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.886547088623047, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8504584431648254, "num_tokens": 208803022.0, "step": 5472 }, { "epoch": 0.6962218547258618, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.029830932617188, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8571600914001465, "num_tokens": 208844205.0, "step": 5473 }, { "epoch": 0.6963490650044524, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.324142456054688, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8663655519485474, "num_tokens": 208886882.0, "step": 5474 }, { "epoch": 0.6964762752830429, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.78759765625, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8530600070953369, "num_tokens": 208926649.0, "step": 5475 }, { "epoch": 0.6966034855616334, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.692094802856445, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8494836091995239, "num_tokens": 208963403.0, "step": 5476 }, { "epoch": 0.6967306958402238, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.496482849121094, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8478950262069702, "num_tokens": 209002475.0, "step": 5477 }, { "epoch": 0.6968579061188144, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.497652053833008, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8553853631019592, "num_tokens": 209044216.0, "step": 5478 }, { "epoch": 0.6969851163974049, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.794143676757812, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8441422581672668, "num_tokens": 209080774.0, "step": 5479 }, { "epoch": 0.6971123266759954, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.899417877197266, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8607778549194336, "num_tokens": 209117167.0, "step": 5480 }, { "epoch": 0.697239536954586, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.730220794677734, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8484145402908325, "num_tokens": 209148872.0, "step": 5481 }, { "epoch": 0.6973667472331765, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.790512084960938, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8564308881759644, "num_tokens": 209187912.0, "step": 5482 }, { "epoch": 0.6974939575117669, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.975927352905273, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8628768920898438, "num_tokens": 209227055.0, "step": 5483 }, { "epoch": 0.6976211677903574, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.773284912109375, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8682535886764526, "num_tokens": 209255343.0, "step": 5484 }, { "epoch": 0.697748378068948, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.189109802246094, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8484957218170166, "num_tokens": 209294621.0, "step": 5485 }, { "epoch": 0.6978755883475385, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.492887496948242, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8693007230758667, "num_tokens": 209336519.0, "step": 5486 }, { "epoch": 0.698002798626129, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 31.042598724365234, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.852545440196991, "num_tokens": 209372913.0, "step": 5487 }, { "epoch": 0.6981300089047195, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.937664031982422, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.845874547958374, "num_tokens": 209417768.0, "step": 5488 }, { "epoch": 0.69825721918331, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.692716598510742, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.857539176940918, "num_tokens": 209451221.0, "step": 5489 }, { "epoch": 0.6983844294619005, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.945024490356445, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8555008172988892, "num_tokens": 209488710.0, "step": 5490 }, { "epoch": 0.698511639740491, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.962909698486328, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8506923913955688, "num_tokens": 209525166.0, "step": 5491 }, { "epoch": 0.6986388500190815, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.745849609375, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8575965166091919, "num_tokens": 209564804.0, "step": 5492 }, { "epoch": 0.6987660602976721, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.018190383911133, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8590298891067505, "num_tokens": 209607155.0, "step": 5493 }, { "epoch": 0.6988932705762626, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.684799194335938, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8445886373519897, "num_tokens": 209641226.0, "step": 5494 }, { "epoch": 0.699020480854853, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.785030364990234, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.850858211517334, "num_tokens": 209679180.0, "step": 5495 }, { "epoch": 0.6991476911334435, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.769424438476562, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8471056222915649, "num_tokens": 209716412.0, "step": 5496 }, { "epoch": 0.6992749014120341, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.489919662475586, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8464099764823914, "num_tokens": 209755697.0, "step": 5497 }, { "epoch": 0.6994021116906246, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.82907485961914, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8613743782043457, "num_tokens": 209793654.0, "step": 5498 }, { "epoch": 0.6995293219692151, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.56360626220703, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.853832483291626, "num_tokens": 209830703.0, "step": 5499 }, { "epoch": 0.6996565322478057, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.58039665222168, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.855932354927063, "num_tokens": 209868994.0, "step": 5500 }, { "epoch": 0.6997837425263961, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.747360229492188, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8641675710678101, "num_tokens": 209909605.0, "step": 5501 }, { "epoch": 0.6999109528049866, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.601276397705078, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.860719621181488, "num_tokens": 209945196.0, "step": 5502 }, { "epoch": 0.7000381630835771, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.539690017700195, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8550384044647217, "num_tokens": 209988585.0, "step": 5503 }, { "epoch": 0.7001653733621677, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.452804565429688, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8552027940750122, "num_tokens": 210032084.0, "step": 5504 }, { "epoch": 0.7002925836407582, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.969379425048828, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8441725969314575, "num_tokens": 210070639.0, "step": 5505 }, { "epoch": 0.7004197939193487, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.421796798706055, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8629048466682434, "num_tokens": 210113793.0, "step": 5506 }, { "epoch": 0.7005470041979391, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.568721771240234, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8470991849899292, "num_tokens": 210153517.0, "step": 5507 }, { "epoch": 0.7006742144765297, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.487960815429688, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.850263774394989, "num_tokens": 210186818.0, "step": 5508 }, { "epoch": 0.7008014247551202, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.543128967285156, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8466609716415405, "num_tokens": 210224283.0, "step": 5509 }, { "epoch": 0.7009286350337107, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.659114837646484, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.849581241607666, "num_tokens": 210263439.0, "step": 5510 }, { "epoch": 0.7010558453123013, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.777318954467773, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8584633469581604, "num_tokens": 210305887.0, "step": 5511 }, { "epoch": 0.7011830555908918, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.747268676757812, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8538081645965576, "num_tokens": 210340118.0, "step": 5512 }, { "epoch": 0.7013102658694823, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.544588088989258, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8572599291801453, "num_tokens": 210376098.0, "step": 5513 }, { "epoch": 0.7014374761480727, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.45195198059082, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8587396144866943, "num_tokens": 210418801.0, "step": 5514 }, { "epoch": 0.7015646864266633, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.694320678710938, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8440640568733215, "num_tokens": 210452392.0, "step": 5515 }, { "epoch": 0.7016918967052538, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.718591690063477, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.862572431564331, "num_tokens": 210489697.0, "step": 5516 }, { "epoch": 0.7018191069838443, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.821762084960938, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8394980430603027, "num_tokens": 210530212.0, "step": 5517 }, { "epoch": 0.7019463172624348, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.845165252685547, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8606099486351013, "num_tokens": 210564974.0, "step": 5518 }, { "epoch": 0.7020735275410254, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.748159408569336, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8547408580780029, "num_tokens": 210601272.0, "step": 5519 }, { "epoch": 0.7022007378196158, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.738759994506836, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.856812059879303, "num_tokens": 210636908.0, "step": 5520 }, { "epoch": 0.7023279480982063, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.239452362060547, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.851638674736023, "num_tokens": 210672629.0, "step": 5521 }, { "epoch": 0.7024551583767968, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.66808319091797, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8684777617454529, "num_tokens": 210703531.0, "step": 5522 }, { "epoch": 0.7025823686553874, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.078088760375977, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8678877353668213, "num_tokens": 210742946.0, "step": 5523 }, { "epoch": 0.7027095789339779, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.29568862915039, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8637987971305847, "num_tokens": 210778345.0, "step": 5524 }, { "epoch": 0.7028367892125684, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.573579788208008, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8624849319458008, "num_tokens": 210821544.0, "step": 5525 }, { "epoch": 0.7029639994911588, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.00795555114746, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8595221042633057, "num_tokens": 210858060.0, "step": 5526 }, { "epoch": 0.7030912097697494, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.00346565246582, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8506085276603699, "num_tokens": 210900365.0, "step": 5527 }, { "epoch": 0.7032184200483399, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.94308090209961, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8503525257110596, "num_tokens": 210941118.0, "step": 5528 }, { "epoch": 0.7033456303269304, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.899120330810547, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8620457649230957, "num_tokens": 210979865.0, "step": 5529 }, { "epoch": 0.703472840605521, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.93764877319336, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8555405139923096, "num_tokens": 211016385.0, "step": 5530 }, { "epoch": 0.7036000508841115, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.981504440307617, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8436864614486694, "num_tokens": 211059071.0, "step": 5531 }, { "epoch": 0.7037272611627019, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.01251792907715, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8576595187187195, "num_tokens": 211094189.0, "step": 5532 }, { "epoch": 0.7038544714412924, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.05397605895996, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8692535758018494, "num_tokens": 211130524.0, "step": 5533 }, { "epoch": 0.703981681719883, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.027416229248047, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8487961292266846, "num_tokens": 211172335.0, "step": 5534 }, { "epoch": 0.7041088919984735, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.75901222229004, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8571304082870483, "num_tokens": 211205406.0, "step": 5535 }, { "epoch": 0.704236102277064, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.42635154724121, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8514702320098877, "num_tokens": 211246443.0, "step": 5536 }, { "epoch": 0.7043633125556545, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.75676727294922, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8694682121276855, "num_tokens": 211286707.0, "step": 5537 }, { "epoch": 0.704490522834245, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 31.581287384033203, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8659805059432983, "num_tokens": 211319448.0, "step": 5538 }, { "epoch": 0.7046177331128355, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.66847038269043, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8508431911468506, "num_tokens": 211363747.0, "step": 5539 }, { "epoch": 0.704744943391426, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.96876335144043, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8622417449951172, "num_tokens": 211408080.0, "step": 5540 }, { "epoch": 0.7048721536700165, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.773815155029297, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8462114334106445, "num_tokens": 211445656.0, "step": 5541 }, { "epoch": 0.7049993639486071, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.843399047851562, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8635013103485107, "num_tokens": 211484342.0, "step": 5542 }, { "epoch": 0.7051265742271976, "ewc_loss": 0.078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.29425048828125e-05, "grad_norm": 30.625167846679688, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8498186469078064, "num_tokens": 211518535.0, "step": 5543 }, { "epoch": 0.705253784505788, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.621919631958008, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.850648820400238, "num_tokens": 211557236.0, "step": 5544 }, { "epoch": 0.7053809947843785, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.83935546875, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8677093982696533, "num_tokens": 211599216.0, "step": 5545 }, { "epoch": 0.7055082050629691, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.706260681152344, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8373547792434692, "num_tokens": 211638930.0, "step": 5546 }, { "epoch": 0.7056354153415596, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.0555477142334, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.848376989364624, "num_tokens": 211680765.0, "step": 5547 }, { "epoch": 0.7057626256201501, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 30.841556549072266, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8696584701538086, "num_tokens": 211719037.0, "step": 5548 }, { "epoch": 0.7058898358987407, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.01332664489746, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8498960733413696, "num_tokens": 211759655.0, "step": 5549 }, { "epoch": 0.7060170461773311, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.021419525146484, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8377267718315125, "num_tokens": 211800405.0, "step": 5550 }, { "epoch": 0.7061442564559216, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.95604133605957, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8375902771949768, "num_tokens": 211846057.0, "step": 5551 }, { "epoch": 0.7062714667345121, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.852176666259766, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8523961901664734, "num_tokens": 211885944.0, "step": 5552 }, { "epoch": 0.7063986770131027, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.978015899658203, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8587395548820496, "num_tokens": 211928727.0, "step": 5553 }, { "epoch": 0.7065258872916932, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.145845413208008, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8448526859283447, "num_tokens": 211967593.0, "step": 5554 }, { "epoch": 0.7066530975702837, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.478515625, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8501909375190735, "num_tokens": 212001023.0, "step": 5555 }, { "epoch": 0.7067803078488741, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.013513565063477, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8478013277053833, "num_tokens": 212045611.0, "step": 5556 }, { "epoch": 0.7069075181274647, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.417686462402344, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8505346179008484, "num_tokens": 212087167.0, "step": 5557 }, { "epoch": 0.7070347284060552, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.021604537963867, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8559638857841492, "num_tokens": 212124254.0, "step": 5558 }, { "epoch": 0.7071619386846457, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.862571716308594, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8492072820663452, "num_tokens": 212162706.0, "step": 5559 }, { "epoch": 0.7072891489632362, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.083141326904297, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8534072637557983, "num_tokens": 212197359.0, "step": 5560 }, { "epoch": 0.7074163592418268, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.990846633911133, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8607883453369141, "num_tokens": 212237300.0, "step": 5561 }, { "epoch": 0.7075435695204173, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.8298397064209, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8358643651008606, "num_tokens": 212283472.0, "step": 5562 }, { "epoch": 0.7076707797990077, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.28592300415039, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8505499958992004, "num_tokens": 212321433.0, "step": 5563 }, { "epoch": 0.7077979900775982, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.44451904296875, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8534741401672363, "num_tokens": 212362234.0, "step": 5564 }, { "epoch": 0.7079252003561888, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.390357971191406, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8455264568328857, "num_tokens": 212399259.0, "step": 5565 }, { "epoch": 0.7080524106347793, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.482885360717773, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.854791522026062, "num_tokens": 212438489.0, "step": 5566 }, { "epoch": 0.7081796209133698, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.462993621826172, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8454055786132812, "num_tokens": 212482283.0, "step": 5567 }, { "epoch": 0.7083068311919604, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.439638137817383, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8501784205436707, "num_tokens": 212522749.0, "step": 5568 }, { "epoch": 0.7084340414705508, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.389780044555664, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8466743230819702, "num_tokens": 212559937.0, "step": 5569 }, { "epoch": 0.7085612517491413, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.593849182128906, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8597539663314819, "num_tokens": 212595578.0, "step": 5570 }, { "epoch": 0.7086884620277318, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.04046058654785, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8662542104721069, "num_tokens": 212634085.0, "step": 5571 }, { "epoch": 0.7088156723063224, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.79844093322754, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8427078127861023, "num_tokens": 212673557.0, "step": 5572 }, { "epoch": 0.7089428825849129, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.689041137695312, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8649605512619019, "num_tokens": 212711142.0, "step": 5573 }, { "epoch": 0.7090700928635034, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.08095932006836, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8534362316131592, "num_tokens": 212745570.0, "step": 5574 }, { "epoch": 0.7091973031420938, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.81608009338379, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8580698370933533, "num_tokens": 212783048.0, "step": 5575 }, { "epoch": 0.7093245134206844, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.025575637817383, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8730975985527039, "num_tokens": 212821231.0, "step": 5576 }, { "epoch": 0.7094517236992749, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 32.34156036376953, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8517732620239258, "num_tokens": 212861715.0, "step": 5577 }, { "epoch": 0.7095789339778654, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 32.205169677734375, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.844589352607727, "num_tokens": 212900655.0, "step": 5578 }, { "epoch": 0.709706144256456, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.406949996948242, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8465532660484314, "num_tokens": 212939308.0, "step": 5579 }, { "epoch": 0.7098333545350465, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.28179359436035, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8594608902931213, "num_tokens": 212981174.0, "step": 5580 }, { "epoch": 0.7099605648136369, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.341934204101562e-05, "grad_norm": 31.14597511291504, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8647232055664062, "num_tokens": 213016401.0, "step": 5581 }, { "epoch": 0.7100877750922274, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.16461181640625, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8415806293487549, "num_tokens": 213058480.0, "step": 5582 }, { "epoch": 0.710214985370818, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.763607025146484, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8662185072898865, "num_tokens": 213093527.0, "step": 5583 }, { "epoch": 0.7103421956494085, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.2827091217041, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8514236211776733, "num_tokens": 213126778.0, "step": 5584 }, { "epoch": 0.710469405927999, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.667329788208008, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8582978248596191, "num_tokens": 213170863.0, "step": 5585 }, { "epoch": 0.7105966162065895, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.034652709960938, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8311019539833069, "num_tokens": 213211336.0, "step": 5586 }, { "epoch": 0.71072382648518, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.639339447021484, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8614339232444763, "num_tokens": 213245262.0, "step": 5587 }, { "epoch": 0.7108510367637705, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.17003059387207, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8466546535491943, "num_tokens": 213282150.0, "step": 5588 }, { "epoch": 0.710978247042361, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.49647331237793, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8603804111480713, "num_tokens": 213325573.0, "step": 5589 }, { "epoch": 0.7111054573209515, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.289676666259766, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8627858757972717, "num_tokens": 213363738.0, "step": 5590 }, { "epoch": 0.7112326675995421, "ewc_loss": 0.0791015625, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.354618072509766, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.860846221446991, "num_tokens": 213404796.0, "step": 5591 }, { "epoch": 0.7113598778781326, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.404987335205078, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.847081184387207, "num_tokens": 213436711.0, "step": 5592 }, { "epoch": 0.711487088156723, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.630868911743164, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8563629388809204, "num_tokens": 213473458.0, "step": 5593 }, { "epoch": 0.7116142984353135, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.15576171875, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8655447959899902, "num_tokens": 213507355.0, "step": 5594 }, { "epoch": 0.7117415087139041, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.612577438354492, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8371948003768921, "num_tokens": 213543771.0, "step": 5595 }, { "epoch": 0.7118687189924946, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.90108299255371, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8524488210678101, "num_tokens": 213578317.0, "step": 5596 }, { "epoch": 0.7119959292710851, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.76127815246582, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8508849143981934, "num_tokens": 213615795.0, "step": 5597 }, { "epoch": 0.7121231395496757, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.762767791748047, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.865027904510498, "num_tokens": 213655452.0, "step": 5598 }, { "epoch": 0.7122503498282661, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.802282333374023, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8455438613891602, "num_tokens": 213688079.0, "step": 5599 }, { "epoch": 0.7123775601068566, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.646957397460938, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8718973398208618, "num_tokens": 213725310.0, "step": 5600 }, { "epoch": 0.7125047703854471, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.644271850585938, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8488845825195312, "num_tokens": 213757950.0, "step": 5601 }, { "epoch": 0.7126319806640377, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.96986961364746, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.841516375541687, "num_tokens": 213794146.0, "step": 5602 }, { "epoch": 0.7127591909426282, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.882265090942383, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8580881953239441, "num_tokens": 213834922.0, "step": 5603 }, { "epoch": 0.7128864012212187, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.745162963867188, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8731831312179565, "num_tokens": 213879651.0, "step": 5604 }, { "epoch": 0.7130136114998091, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.993942260742188, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8543849587440491, "num_tokens": 213921833.0, "step": 5605 }, { "epoch": 0.7131408217783997, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.914318084716797, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.860079288482666, "num_tokens": 213962909.0, "step": 5606 }, { "epoch": 0.7132680320569902, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.817279815673828, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8563088178634644, "num_tokens": 214002030.0, "step": 5607 }, { "epoch": 0.7133952423355807, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.833967208862305, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8476386666297913, "num_tokens": 214037094.0, "step": 5608 }, { "epoch": 0.7135224526141712, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.998605728149414, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8599117994308472, "num_tokens": 214076649.0, "step": 5609 }, { "epoch": 0.7136496628927618, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.52052879333496, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8505544662475586, "num_tokens": 214114194.0, "step": 5610 }, { "epoch": 0.7137768731713523, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.051982879638672, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8564199209213257, "num_tokens": 214149652.0, "step": 5611 }, { "epoch": 0.7139040834499427, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.700672149658203, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8647141456604004, "num_tokens": 214189068.0, "step": 5612 }, { "epoch": 0.7140312937285332, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.968923568725586, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8429111242294312, "num_tokens": 214220251.0, "step": 5613 }, { "epoch": 0.7141585040071238, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.8803768157959, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8529213070869446, "num_tokens": 214256705.0, "step": 5614 }, { "epoch": 0.7142857142857143, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.885255813598633, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8669806718826294, "num_tokens": 214292563.0, "step": 5615 }, { "epoch": 0.7144129245643048, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.838424682617188, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8486109972000122, "num_tokens": 214333843.0, "step": 5616 }, { "epoch": 0.7145401348428954, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.90888786315918, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8667736053466797, "num_tokens": 214372272.0, "step": 5617 }, { "epoch": 0.7146673451214858, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.772733688354492, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8617905378341675, "num_tokens": 214416364.0, "step": 5618 }, { "epoch": 0.7147945554000763, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.263805389404297, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8537995219230652, "num_tokens": 214453102.0, "step": 5619 }, { "epoch": 0.7149217656786668, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.481807708740234, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8583784699440002, "num_tokens": 214492374.0, "step": 5620 }, { "epoch": 0.7150489759572574, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.305519104003906, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8539682626724243, "num_tokens": 214530918.0, "step": 5621 }, { "epoch": 0.7151761862358479, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.981454849243164, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8700109720230103, "num_tokens": 214570227.0, "step": 5622 }, { "epoch": 0.7153033965144384, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.087257385253906, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8571397066116333, "num_tokens": 214609519.0, "step": 5623 }, { "epoch": 0.7154306067930288, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.84061622619629, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8521087765693665, "num_tokens": 214643564.0, "step": 5624 }, { "epoch": 0.7155578170716194, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.48602867126465, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8636186122894287, "num_tokens": 214677632.0, "step": 5625 }, { "epoch": 0.7156850273502099, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.791898727416992, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8575013279914856, "num_tokens": 214710363.0, "step": 5626 }, { "epoch": 0.7158122376288004, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.251625061035156, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8638720512390137, "num_tokens": 214746982.0, "step": 5627 }, { "epoch": 0.715939447907391, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.004390716552734, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8456779718399048, "num_tokens": 214786447.0, "step": 5628 }, { "epoch": 0.7160666581859815, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.217754364013672, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8627668023109436, "num_tokens": 214824627.0, "step": 5629 }, { "epoch": 0.7161938684645719, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.471660614013672, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.835814893245697, "num_tokens": 214863051.0, "step": 5630 }, { "epoch": 0.7163210787431624, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.00187873840332, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8419284224510193, "num_tokens": 214900524.0, "step": 5631 }, { "epoch": 0.716448289021753, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.173410415649414, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8604952096939087, "num_tokens": 214943949.0, "step": 5632 }, { "epoch": 0.7165754993003435, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.174467086791992, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8596092462539673, "num_tokens": 214979126.0, "step": 5633 }, { "epoch": 0.716702709578934, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.884443283081055, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8552191853523254, "num_tokens": 215017740.0, "step": 5634 }, { "epoch": 0.7168299198575245, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.411977767944336, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8627384901046753, "num_tokens": 215054472.0, "step": 5635 }, { "epoch": 0.716957130136115, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.75586700439453, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8562465310096741, "num_tokens": 215086894.0, "step": 5636 }, { "epoch": 0.7170843404147055, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.193315505981445, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8566553592681885, "num_tokens": 215128157.0, "step": 5637 }, { "epoch": 0.717211550693296, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 30.93081283569336, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8630499839782715, "num_tokens": 215161008.0, "step": 5638 }, { "epoch": 0.7173387609718865, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.265888214111328, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8394033908843994, "num_tokens": 215197157.0, "step": 5639 }, { "epoch": 0.7174659712504771, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.008161544799805, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8626571297645569, "num_tokens": 215240270.0, "step": 5640 }, { "epoch": 0.7175931815290676, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.22005844116211, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.862293541431427, "num_tokens": 215279828.0, "step": 5641 }, { "epoch": 0.717720391807658, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.231740951538086, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8492355346679688, "num_tokens": 215315271.0, "step": 5642 }, { "epoch": 0.7178476020862485, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.091053009033203, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8568165302276611, "num_tokens": 215352621.0, "step": 5643 }, { "epoch": 0.7179748123648391, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.14229393005371, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8577109575271606, "num_tokens": 215390231.0, "step": 5644 }, { "epoch": 0.7181020226434296, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.000272750854492, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8697754144668579, "num_tokens": 215426812.0, "step": 5645 }, { "epoch": 0.7182292329220201, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.607885360717773, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8438395857810974, "num_tokens": 215467551.0, "step": 5646 }, { "epoch": 0.7183564432006107, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.650951385498047, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8641235828399658, "num_tokens": 215508559.0, "step": 5647 }, { "epoch": 0.7184836534792011, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.14400863647461, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8489594459533691, "num_tokens": 215548709.0, "step": 5648 }, { "epoch": 0.7186108637577916, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.045530319213867, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.855796217918396, "num_tokens": 215593844.0, "step": 5649 }, { "epoch": 0.7187380740363821, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.93631362915039, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8609007000923157, "num_tokens": 215633853.0, "step": 5650 }, { "epoch": 0.7188652843149727, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 30.992755889892578, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8547682762145996, "num_tokens": 215667272.0, "step": 5651 }, { "epoch": 0.7189924945935632, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.229387283325195, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8379026651382446, "num_tokens": 215705889.0, "step": 5652 }, { "epoch": 0.7191197048721537, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.051891326904297, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8647130131721497, "num_tokens": 215749143.0, "step": 5653 }, { "epoch": 0.7192469151507441, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.280942916870117, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8496586084365845, "num_tokens": 215787583.0, "step": 5654 }, { "epoch": 0.7193741254293347, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.37841033935547, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8563954830169678, "num_tokens": 215825552.0, "step": 5655 }, { "epoch": 0.7195013357079252, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.98553466796875, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8657348155975342, "num_tokens": 215858310.0, "step": 5656 }, { "epoch": 0.7196285459865157, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.31808090209961, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8450363874435425, "num_tokens": 215898403.0, "step": 5657 }, { "epoch": 0.7197557562651062, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.832645416259766, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8331945538520813, "num_tokens": 215935930.0, "step": 5658 }, { "epoch": 0.7198829665436968, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.653078079223633, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8525610566139221, "num_tokens": 215971509.0, "step": 5659 }, { "epoch": 0.7200101768222873, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.13658332824707, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8608266711235046, "num_tokens": 216008696.0, "step": 5660 }, { "epoch": 0.7201373871008777, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.35185432434082, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8561925888061523, "num_tokens": 216045861.0, "step": 5661 }, { "epoch": 0.7202645973794682, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.843219757080078, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8651332855224609, "num_tokens": 216081307.0, "step": 5662 }, { "epoch": 0.7203918076580588, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.944217681884766, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.865272045135498, "num_tokens": 216118506.0, "step": 5663 }, { "epoch": 0.7205190179366493, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.269155502319336, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8493461012840271, "num_tokens": 216159000.0, "step": 5664 }, { "epoch": 0.7206462282152398, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.40217399597168, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.848213255405426, "num_tokens": 216204712.0, "step": 5665 }, { "epoch": 0.7207734384938304, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.153059005737305, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8478837609291077, "num_tokens": 216235001.0, "step": 5666 }, { "epoch": 0.7209006487724208, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.68922996520996, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8482469320297241, "num_tokens": 216271383.0, "step": 5667 }, { "epoch": 0.7210278590510113, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.048828125, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.856160044670105, "num_tokens": 216307923.0, "step": 5668 }, { "epoch": 0.7211550693296018, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.366443634033203, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8741153478622437, "num_tokens": 216345147.0, "step": 5669 }, { "epoch": 0.7212822796081924, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.219165802001953, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8478874564170837, "num_tokens": 216386991.0, "step": 5670 }, { "epoch": 0.7214094898867829, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.31423568725586, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.859099805355072, "num_tokens": 216425534.0, "step": 5671 }, { "epoch": 0.7215367001653734, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.010971069335938, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.856419026851654, "num_tokens": 216466686.0, "step": 5672 }, { "epoch": 0.7216639104439638, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.280550003051758, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8685519695281982, "num_tokens": 216505245.0, "step": 5673 }, { "epoch": 0.7217911207225544, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.223299026489258, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8530435562133789, "num_tokens": 216538024.0, "step": 5674 }, { "epoch": 0.7219183310011449, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.160568237304688, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8555706739425659, "num_tokens": 216573722.0, "step": 5675 }, { "epoch": 0.7220455412797354, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.13518714904785, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.861207127571106, "num_tokens": 216610362.0, "step": 5676 }, { "epoch": 0.7221727515583259, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.618408203125, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8472137451171875, "num_tokens": 216648732.0, "step": 5677 }, { "epoch": 0.7222999618369165, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.017515182495117, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8649656772613525, "num_tokens": 216680324.0, "step": 5678 }, { "epoch": 0.7224271721155069, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.354660034179688, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.859346866607666, "num_tokens": 216712773.0, "step": 5679 }, { "epoch": 0.7225543823940974, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.033777236938477, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8480129241943359, "num_tokens": 216752538.0, "step": 5680 }, { "epoch": 0.722681592672688, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.23404884338379, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.86004638671875, "num_tokens": 216791795.0, "step": 5681 }, { "epoch": 0.7228088029512785, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.569225311279297, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8521891832351685, "num_tokens": 216831729.0, "step": 5682 }, { "epoch": 0.722936013229869, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.661991119384766, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8556705713272095, "num_tokens": 216869494.0, "step": 5683 }, { "epoch": 0.7230632235084595, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.275981903076172, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8374335169792175, "num_tokens": 216906672.0, "step": 5684 }, { "epoch": 0.72319043378705, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.25396728515625, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8628059029579163, "num_tokens": 216941587.0, "step": 5685 }, { "epoch": 0.7233176440656405, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.58251190185547, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8486535549163818, "num_tokens": 216981812.0, "step": 5686 }, { "epoch": 0.723444854344231, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.346105575561523, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8659316301345825, "num_tokens": 217024137.0, "step": 5687 }, { "epoch": 0.7235720646228215, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.07449722290039, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8689117431640625, "num_tokens": 217063753.0, "step": 5688 }, { "epoch": 0.7236992749014121, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.479551315307617, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8544207811355591, "num_tokens": 217105611.0, "step": 5689 }, { "epoch": 0.7238264851800026, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.094961166381836, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8358660936355591, "num_tokens": 217145989.0, "step": 5690 }, { "epoch": 0.723953695458593, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.403108596801758, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.874250590801239, "num_tokens": 217183726.0, "step": 5691 }, { "epoch": 0.7240809057371835, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.14967155456543, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8531087040901184, "num_tokens": 217226644.0, "step": 5692 }, { "epoch": 0.7242081160157741, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.136659622192383, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.841661810874939, "num_tokens": 217263327.0, "step": 5693 }, { "epoch": 0.7243353262943646, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.22348403930664, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8617987036705017, "num_tokens": 217300595.0, "step": 5694 }, { "epoch": 0.7244625365729551, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.617393493652344, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8423265814781189, "num_tokens": 217336385.0, "step": 5695 }, { "epoch": 0.7245897468515456, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.426807403564453, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8621070981025696, "num_tokens": 217374309.0, "step": 5696 }, { "epoch": 0.7247169571301361, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.64842414855957, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8465131521224976, "num_tokens": 217408386.0, "step": 5697 }, { "epoch": 0.7248441674087266, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.180030822753906, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8415530323982239, "num_tokens": 217442855.0, "step": 5698 }, { "epoch": 0.7249713776873171, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.886228561401367, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8611745834350586, "num_tokens": 217476892.0, "step": 5699 }, { "epoch": 0.7250985879659076, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.350244522094727, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8541173934936523, "num_tokens": 217513569.0, "step": 5700 }, { "epoch": 0.7252257982444982, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.840776443481445, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8501877784729004, "num_tokens": 217545585.0, "step": 5701 }, { "epoch": 0.7253530085230887, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.97520637512207, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8581086993217468, "num_tokens": 217583156.0, "step": 5702 }, { "epoch": 0.7254802188016791, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.823637008666992, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8557581901550293, "num_tokens": 217619630.0, "step": 5703 }, { "epoch": 0.7256074290802697, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.961698532104492, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8664246797561646, "num_tokens": 217653616.0, "step": 5704 }, { "epoch": 0.7257346393588602, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.179975509643555, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8511598706245422, "num_tokens": 217692758.0, "step": 5705 }, { "epoch": 0.7258618496374507, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.94462776184082, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8403227925300598, "num_tokens": 217734175.0, "step": 5706 }, { "epoch": 0.7259890599160412, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.109983444213867, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8333675861358643, "num_tokens": 217773372.0, "step": 5707 }, { "epoch": 0.7261162701946318, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.730148315429688, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.864693284034729, "num_tokens": 217809174.0, "step": 5708 }, { "epoch": 0.7262434804732223, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.152360916137695, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8479856252670288, "num_tokens": 217845882.0, "step": 5709 }, { "epoch": 0.7263706907518127, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.048925399780273, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8516098856925964, "num_tokens": 217885798.0, "step": 5710 }, { "epoch": 0.7264979010304032, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.861289978027344, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8522989749908447, "num_tokens": 217924967.0, "step": 5711 }, { "epoch": 0.7266251113089938, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.120880126953125, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8679509162902832, "num_tokens": 217967381.0, "step": 5712 }, { "epoch": 0.7267523215875843, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.055736541748047, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8601412773132324, "num_tokens": 218001097.0, "step": 5713 }, { "epoch": 0.7268795318661748, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.242265701293945, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8566597104072571, "num_tokens": 218040705.0, "step": 5714 }, { "epoch": 0.7270067421447653, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.701135635375977, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8536557555198669, "num_tokens": 218077549.0, "step": 5715 }, { "epoch": 0.7271339524233558, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.3035831451416, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.854963481426239, "num_tokens": 218116251.0, "step": 5716 }, { "epoch": 0.7272611627019463, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.542524337768555, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8291371464729309, "num_tokens": 218155798.0, "step": 5717 }, { "epoch": 0.7273883729805368, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.472370147705078, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8606961965560913, "num_tokens": 218191277.0, "step": 5718 }, { "epoch": 0.7275155832591274, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.021404266357422, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.861652135848999, "num_tokens": 218233165.0, "step": 5719 }, { "epoch": 0.7276427935377179, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.35987663269043, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8354520201683044, "num_tokens": 218271263.0, "step": 5720 }, { "epoch": 0.7277700038163084, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.139543533325195, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.856579065322876, "num_tokens": 218309081.0, "step": 5721 }, { "epoch": 0.7278972140948988, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.181804656982422, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8494153022766113, "num_tokens": 218348314.0, "step": 5722 }, { "epoch": 0.7280244243734894, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.144073486328125, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8583921194076538, "num_tokens": 218387556.0, "step": 5723 }, { "epoch": 0.7281516346520799, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.008270263671875, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8467949032783508, "num_tokens": 218423130.0, "step": 5724 }, { "epoch": 0.7282788449306704, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.221418380737305, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8371156454086304, "num_tokens": 218464079.0, "step": 5725 }, { "epoch": 0.7284060552092609, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.231895446777344, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8635861873626709, "num_tokens": 218501307.0, "step": 5726 }, { "epoch": 0.7285332654878515, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.947921752929688, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8641077280044556, "num_tokens": 218537710.0, "step": 5727 }, { "epoch": 0.7286604757664419, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.098888397216797, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8509060144424438, "num_tokens": 218576026.0, "step": 5728 }, { "epoch": 0.7287876860450324, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.515714645385742, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8521360158920288, "num_tokens": 218612491.0, "step": 5729 }, { "epoch": 0.7289148963236229, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.983545303344727, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.864051342010498, "num_tokens": 218642541.0, "step": 5730 }, { "epoch": 0.7290421066022135, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.87004852294922, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8405550718307495, "num_tokens": 218678514.0, "step": 5731 }, { "epoch": 0.729169316880804, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.792612075805664, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8588554859161377, "num_tokens": 218715415.0, "step": 5732 }, { "epoch": 0.7292965271593945, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.593061447143555, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8633872270584106, "num_tokens": 218754422.0, "step": 5733 }, { "epoch": 0.7294237374379849, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.56741714477539, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8462333679199219, "num_tokens": 218787141.0, "step": 5734 }, { "epoch": 0.7295509477165755, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.45475196838379, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8700193166732788, "num_tokens": 218824764.0, "step": 5735 }, { "epoch": 0.729678157995166, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.31743049621582, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8547095656394958, "num_tokens": 218866142.0, "step": 5736 }, { "epoch": 0.7298053682737565, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.291831970214844, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8503483533859253, "num_tokens": 218909208.0, "step": 5737 }, { "epoch": 0.7299325785523471, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.56675148010254, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8625736236572266, "num_tokens": 218942469.0, "step": 5738 }, { "epoch": 0.7300597888309376, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 30.888761520385742, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8480982780456543, "num_tokens": 218982934.0, "step": 5739 }, { "epoch": 0.730186999109528, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.910625457763672, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8514047861099243, "num_tokens": 219027055.0, "step": 5740 }, { "epoch": 0.7303142093881185, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.390249252319336, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8570441007614136, "num_tokens": 219069639.0, "step": 5741 }, { "epoch": 0.7304414196667091, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.522083282470703, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8496437668800354, "num_tokens": 219111774.0, "step": 5742 }, { "epoch": 0.7305686299452996, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.717853546142578, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8422088623046875, "num_tokens": 219156962.0, "step": 5743 }, { "epoch": 0.7306958402238901, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.406173706054688, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.863945722579956, "num_tokens": 219193352.0, "step": 5744 }, { "epoch": 0.7308230505024806, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.6713809967041, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8402249813079834, "num_tokens": 219230041.0, "step": 5745 }, { "epoch": 0.7309502607810711, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.765972137451172, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8322548866271973, "num_tokens": 219262748.0, "step": 5746 }, { "epoch": 0.7310774710596616, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.360883712768555, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8666924238204956, "num_tokens": 219296556.0, "step": 5747 }, { "epoch": 0.7312046813382521, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.768360137939453, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8423386812210083, "num_tokens": 219330801.0, "step": 5748 }, { "epoch": 0.7313318916168426, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.722789764404297, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8560221791267395, "num_tokens": 219367470.0, "step": 5749 }, { "epoch": 0.7314591018954332, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.0360050201416, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8418969511985779, "num_tokens": 219405291.0, "step": 5750 }, { "epoch": 0.7315863121740237, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.88511085510254, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8574934005737305, "num_tokens": 219443365.0, "step": 5751 }, { "epoch": 0.7317135224526141, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.506460189819336, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8566195964813232, "num_tokens": 219477524.0, "step": 5752 }, { "epoch": 0.7318407327312046, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 32.00172805786133, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8457759022712708, "num_tokens": 219514576.0, "step": 5753 }, { "epoch": 0.7319679430097952, "ewc_loss": 0.07958984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.389617919921875e-05, "grad_norm": 31.531293869018555, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8556170463562012, "num_tokens": 219558253.0, "step": 5754 }, { "epoch": 0.7320951532883857, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.603622436523438, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8506539463996887, "num_tokens": 219601156.0, "step": 5755 }, { "epoch": 0.7322223635669762, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.335201263427734, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8446022868156433, "num_tokens": 219638241.0, "step": 5756 }, { "epoch": 0.7323495738455668, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.439453125, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8554831743240356, "num_tokens": 219678168.0, "step": 5757 }, { "epoch": 0.7324767841241572, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.521255493164062, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8537300825119019, "num_tokens": 219718115.0, "step": 5758 }, { "epoch": 0.7326039944027477, "ewc_loss": 0.080078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.437301635742188e-05, "grad_norm": 31.14974594116211, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8539013266563416, "num_tokens": 219754221.0, "step": 5759 }, { "epoch": 0.7327312046813382, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.399675369262695, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8610059022903442, "num_tokens": 219786880.0, "step": 5760 }, { "epoch": 0.7328584149599288, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.148834228515625, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8596414923667908, "num_tokens": 219830585.0, "step": 5761 }, { "epoch": 0.7329856252385193, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.579456329345703, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.857837438583374, "num_tokens": 219867249.0, "step": 5762 }, { "epoch": 0.7331128355171098, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.3496150970459, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8452526330947876, "num_tokens": 219906046.0, "step": 5763 }, { "epoch": 0.7332400457957003, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.24580192565918, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8460006713867188, "num_tokens": 219947466.0, "step": 5764 }, { "epoch": 0.7333672560742908, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.242042541503906, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8565709590911865, "num_tokens": 219985698.0, "step": 5765 }, { "epoch": 0.7334944663528813, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.397111892700195, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8554500341415405, "num_tokens": 220026158.0, "step": 5766 }, { "epoch": 0.7336216766314718, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.65384864807129, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8552089929580688, "num_tokens": 220071849.0, "step": 5767 }, { "epoch": 0.7337488869100623, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.286224365234375, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8618227243423462, "num_tokens": 220110255.0, "step": 5768 }, { "epoch": 0.7338760971886529, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.464033126831055, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8398940563201904, "num_tokens": 220144962.0, "step": 5769 }, { "epoch": 0.7340033074672434, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.13227653503418, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8454376459121704, "num_tokens": 220182118.0, "step": 5770 }, { "epoch": 0.7341305177458338, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.618074417114258, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8457627296447754, "num_tokens": 220218954.0, "step": 5771 }, { "epoch": 0.7342577280244243, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.18288803100586, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8535973429679871, "num_tokens": 220259116.0, "step": 5772 }, { "epoch": 0.7343849383030149, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.750139236450195, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8508503437042236, "num_tokens": 220301742.0, "step": 5773 }, { "epoch": 0.7345121485816054, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.191123962402344, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.858667254447937, "num_tokens": 220342572.0, "step": 5774 }, { "epoch": 0.7346393588601959, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.90462303161621, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8599237203598022, "num_tokens": 220376373.0, "step": 5775 }, { "epoch": 0.7347665691387865, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.174848556518555, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.871242880821228, "num_tokens": 220413290.0, "step": 5776 }, { "epoch": 0.7348937794173769, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.054840087890625, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8508679270744324, "num_tokens": 220453219.0, "step": 5777 }, { "epoch": 0.7350209896959674, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.30620574951172, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8546251654624939, "num_tokens": 220488369.0, "step": 5778 }, { "epoch": 0.7351481999745579, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.785736083984375, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8593078255653381, "num_tokens": 220527798.0, "step": 5779 }, { "epoch": 0.7352754102531485, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.324474334716797, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8734759092330933, "num_tokens": 220565398.0, "step": 5780 }, { "epoch": 0.735402620531739, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.951231002807617, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8652041554450989, "num_tokens": 220603903.0, "step": 5781 }, { "epoch": 0.7355298308103295, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.07904624938965, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8371131420135498, "num_tokens": 220644168.0, "step": 5782 }, { "epoch": 0.7356570410889199, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.825485229492188, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.867139458656311, "num_tokens": 220679709.0, "step": 5783 }, { "epoch": 0.7357842513675105, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.995452880859375, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8490384221076965, "num_tokens": 220721184.0, "step": 5784 }, { "epoch": 0.735911461646101, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.807735443115234, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8581013679504395, "num_tokens": 220755027.0, "step": 5785 }, { "epoch": 0.7360386719246915, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.078327178955078, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8301221132278442, "num_tokens": 220793597.0, "step": 5786 }, { "epoch": 0.736165882203282, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.15949249267578, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8543956279754639, "num_tokens": 220833549.0, "step": 5787 }, { "epoch": 0.7362930924818726, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.080150604248047, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8608154058456421, "num_tokens": 220865551.0, "step": 5788 }, { "epoch": 0.736420302760463, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.19178771972656, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8674774765968323, "num_tokens": 220911626.0, "step": 5789 }, { "epoch": 0.7365475130390535, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.38425064086914, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8559949398040771, "num_tokens": 220949839.0, "step": 5790 }, { "epoch": 0.736674723317644, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.99676513671875, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8386392593383789, "num_tokens": 220984069.0, "step": 5791 }, { "epoch": 0.7368019335962346, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.574451446533203, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8755671977996826, "num_tokens": 221020999.0, "step": 5792 }, { "epoch": 0.7369291438748251, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.56620979309082, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8447487354278564, "num_tokens": 221051082.0, "step": 5793 }, { "epoch": 0.7370563541534156, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.550926208496094, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8374612331390381, "num_tokens": 221095444.0, "step": 5794 }, { "epoch": 0.737183564432006, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.893787384033203, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8575088381767273, "num_tokens": 221136433.0, "step": 5795 }, { "epoch": 0.7373107747105966, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.242950439453125, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8471764922142029, "num_tokens": 221171278.0, "step": 5796 }, { "epoch": 0.7374379849891871, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 32.04204177856445, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8521234393119812, "num_tokens": 221209380.0, "step": 5797 }, { "epoch": 0.7375651952677776, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.527307510375977, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.853150486946106, "num_tokens": 221253076.0, "step": 5798 }, { "epoch": 0.7376924055463682, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.899639129638672, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8497292995452881, "num_tokens": 221290348.0, "step": 5799 }, { "epoch": 0.7378196158249587, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.53845977783203, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8606948852539062, "num_tokens": 221329635.0, "step": 5800 }, { "epoch": 0.7379468261035491, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.37485694885254, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8623772859573364, "num_tokens": 221368521.0, "step": 5801 }, { "epoch": 0.7380740363821396, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.324365615844727, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.86738121509552, "num_tokens": 221407090.0, "step": 5802 }, { "epoch": 0.7382012466607302, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.62518310546875, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8647398948669434, "num_tokens": 221450340.0, "step": 5803 }, { "epoch": 0.7383284569393207, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.918304443359375, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8551574945449829, "num_tokens": 221491996.0, "step": 5804 }, { "epoch": 0.7384556672179112, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.54231071472168, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8633320331573486, "num_tokens": 221531466.0, "step": 5805 }, { "epoch": 0.7385828774965018, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.835205078125, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8349946737289429, "num_tokens": 221567003.0, "step": 5806 }, { "epoch": 0.7387100877750922, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.78892707824707, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8676433563232422, "num_tokens": 221607396.0, "step": 5807 }, { "epoch": 0.7388372980536827, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.41765022277832, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8472235798835754, "num_tokens": 221644776.0, "step": 5808 }, { "epoch": 0.7389645083322732, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.872690200805664, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8685804009437561, "num_tokens": 221685643.0, "step": 5809 }, { "epoch": 0.7390917186108638, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.25309944152832, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8492766618728638, "num_tokens": 221724673.0, "step": 5810 }, { "epoch": 0.7392189288894543, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.691797256469727, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8304898738861084, "num_tokens": 221766287.0, "step": 5811 }, { "epoch": 0.7393461391680448, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.228910446166992, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8538403511047363, "num_tokens": 221805357.0, "step": 5812 }, { "epoch": 0.7394733494466353, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.782917022705078, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8526158928871155, "num_tokens": 221838513.0, "step": 5813 }, { "epoch": 0.7396005597252258, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.6411075592041, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8435806035995483, "num_tokens": 221882808.0, "step": 5814 }, { "epoch": 0.7397277700038163, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 32.07981491088867, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8672616481781006, "num_tokens": 221918505.0, "step": 5815 }, { "epoch": 0.7398549802824068, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.256921768188477, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8564643859863281, "num_tokens": 221955983.0, "step": 5816 }, { "epoch": 0.7399821905609973, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.897035598754883, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8540173768997192, "num_tokens": 221997363.0, "step": 5817 }, { "epoch": 0.7401094008395879, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.788990020751953, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.840418815612793, "num_tokens": 222039302.0, "step": 5818 }, { "epoch": 0.7402366111181784, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.86292839050293, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.853633463382721, "num_tokens": 222075779.0, "step": 5819 }, { "epoch": 0.7403638213967688, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.36105728149414, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8732825517654419, "num_tokens": 222115120.0, "step": 5820 }, { "epoch": 0.7404910316753593, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.889299392700195, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8477049469947815, "num_tokens": 222156570.0, "step": 5821 }, { "epoch": 0.7406182419539499, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.634313583374023, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8545604348182678, "num_tokens": 222189487.0, "step": 5822 }, { "epoch": 0.7407454522325404, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.707176208496094, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8533772230148315, "num_tokens": 222223163.0, "step": 5823 }, { "epoch": 0.7408726625111309, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.817163467407227, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8664637804031372, "num_tokens": 222259042.0, "step": 5824 }, { "epoch": 0.7409998727897215, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.911664962768555, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8354370594024658, "num_tokens": 222300497.0, "step": 5825 }, { "epoch": 0.7411270830683119, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.680362701416016, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8494299650192261, "num_tokens": 222333630.0, "step": 5826 }, { "epoch": 0.7412542933469024, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.97406768798828, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8503952026367188, "num_tokens": 222371715.0, "step": 5827 }, { "epoch": 0.7413815036254929, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.468955993652344, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8556122183799744, "num_tokens": 222412044.0, "step": 5828 }, { "epoch": 0.7415087139040835, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 32.54875564575195, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8573068380355835, "num_tokens": 222452668.0, "step": 5829 }, { "epoch": 0.741635924182674, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.75393295288086, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8569874167442322, "num_tokens": 222485631.0, "step": 5830 }, { "epoch": 0.7417631344612645, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.82988929748535, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8697469234466553, "num_tokens": 222522967.0, "step": 5831 }, { "epoch": 0.7418903447398549, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.588529586791992, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8447879552841187, "num_tokens": 222563258.0, "step": 5832 }, { "epoch": 0.7420175550184455, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 32.340335845947266, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8482328057289124, "num_tokens": 222599248.0, "step": 5833 }, { "epoch": 0.742144765297036, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.679733276367188, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8603985905647278, "num_tokens": 222642575.0, "step": 5834 }, { "epoch": 0.7422719755756265, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 32.120182037353516, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8306599855422974, "num_tokens": 222687330.0, "step": 5835 }, { "epoch": 0.742399185854217, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.336017608642578, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.829453706741333, "num_tokens": 222720020.0, "step": 5836 }, { "epoch": 0.7425263961328076, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.919475555419922, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8681744337081909, "num_tokens": 222758634.0, "step": 5837 }, { "epoch": 0.742653606411398, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 32.07767105102539, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8623461723327637, "num_tokens": 222794823.0, "step": 5838 }, { "epoch": 0.7427808166899885, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.7476863861084, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8541899919509888, "num_tokens": 222833836.0, "step": 5839 }, { "epoch": 0.742908026968579, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.74553680419922, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8575224876403809, "num_tokens": 222875712.0, "step": 5840 }, { "epoch": 0.7430352372471696, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 32.323760986328125, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8444156646728516, "num_tokens": 222915494.0, "step": 5841 }, { "epoch": 0.7431624475257601, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.193979263305664, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8719605207443237, "num_tokens": 222951675.0, "step": 5842 }, { "epoch": 0.7432896578043506, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 32.3170166015625, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8509799838066101, "num_tokens": 222993245.0, "step": 5843 }, { "epoch": 0.743416868082941, "ewc_loss": 0.08056640625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.4849853515625e-05, "grad_norm": 31.396638870239258, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8610973358154297, "num_tokens": 223029876.0, "step": 5844 }, { "epoch": 0.7435440783615316, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 32.13811111450195, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8604693412780762, "num_tokens": 223072550.0, "step": 5845 }, { "epoch": 0.7436712886401221, "ewc_loss": 0.08154296875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 31.26116180419922, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8522096872329712, "num_tokens": 223112892.0, "step": 5846 }, { "epoch": 0.7437984989187126, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.739206314086914, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8609696626663208, "num_tokens": 223157093.0, "step": 5847 }, { "epoch": 0.7439257091973032, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.382022857666016, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8458641171455383, "num_tokens": 223190756.0, "step": 5848 }, { "epoch": 0.7440529194758937, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.997589111328125, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8454172611236572, "num_tokens": 223226013.0, "step": 5849 }, { "epoch": 0.7441801297544841, "ewc_loss": 0.0810546875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 31.102970123291016, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8671661615371704, "num_tokens": 223262858.0, "step": 5850 }, { "epoch": 0.7443073400330746, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.575634002685547, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8595177531242371, "num_tokens": 223310528.0, "step": 5851 }, { "epoch": 0.7444345503116652, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.350757598876953, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8553295135498047, "num_tokens": 223345021.0, "step": 5852 }, { "epoch": 0.7445617605902557, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.46889305114746, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8644536733627319, "num_tokens": 223382055.0, "step": 5853 }, { "epoch": 0.7446889708688462, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.547117233276367, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8456791639328003, "num_tokens": 223418441.0, "step": 5854 }, { "epoch": 0.7448161811474368, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.216135025024414, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8531917333602905, "num_tokens": 223458150.0, "step": 5855 }, { "epoch": 0.7449433914260272, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.63951873779297, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8511831760406494, "num_tokens": 223496916.0, "step": 5856 }, { "epoch": 0.7450706017046177, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.29732894897461, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.860558271408081, "num_tokens": 223534808.0, "step": 5857 }, { "epoch": 0.7451978119832082, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.62255859375, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8651387691497803, "num_tokens": 223571255.0, "step": 5858 }, { "epoch": 0.7453250222617988, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.281482696533203, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8595356345176697, "num_tokens": 223616608.0, "step": 5859 }, { "epoch": 0.7454522325403893, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.331209182739258, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8549280166625977, "num_tokens": 223656188.0, "step": 5860 }, { "epoch": 0.7455794428189798, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.313383102416992, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.8509566783905029, "num_tokens": 223693787.0, "step": 5861 }, { "epoch": 0.7457066530975703, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.07001495361328, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8670166730880737, "num_tokens": 223729350.0, "step": 5862 }, { "epoch": 0.7458338633761608, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.459632873535156, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8430572748184204, "num_tokens": 223770846.0, "step": 5863 }, { "epoch": 0.7459610736547513, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.148662567138672, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8542284965515137, "num_tokens": 223814194.0, "step": 5864 }, { "epoch": 0.7460882839333418, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.20526123046875, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8479604721069336, "num_tokens": 223849141.0, "step": 5865 }, { "epoch": 0.7462154942119323, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.21824073791504, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8766323328018188, "num_tokens": 223888654.0, "step": 5866 }, { "epoch": 0.7463427044905229, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.580669403076172, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.857856810092926, "num_tokens": 223924509.0, "step": 5867 }, { "epoch": 0.7464699147691134, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.395280838012695, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8406463265419006, "num_tokens": 223966191.0, "step": 5868 }, { "epoch": 0.7465971250477038, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.332799911499023, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8570662140846252, "num_tokens": 224001415.0, "step": 5869 }, { "epoch": 0.7467243353262943, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.196752548217773, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8611156344413757, "num_tokens": 224042820.0, "step": 5870 }, { "epoch": 0.7468515456048849, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.683223724365234, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8594712018966675, "num_tokens": 224078411.0, "step": 5871 }, { "epoch": 0.7469787558834754, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 30.97307586669922, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8627282381057739, "num_tokens": 224117136.0, "step": 5872 }, { "epoch": 0.7471059661620659, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.58136558532715, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8482840061187744, "num_tokens": 224155879.0, "step": 5873 }, { "epoch": 0.7472331764406565, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.109394073486328, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8655718564987183, "num_tokens": 224195186.0, "step": 5874 }, { "epoch": 0.7473603867192469, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.53325080871582, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8524853587150574, "num_tokens": 224235722.0, "step": 5875 }, { "epoch": 0.7474875969978374, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.049081802368164, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8627182245254517, "num_tokens": 224268461.0, "step": 5876 }, { "epoch": 0.7476148072764279, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.31334686279297, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8680054545402527, "num_tokens": 224302921.0, "step": 5877 }, { "epoch": 0.7477420175550185, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.24985694885254, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8557319641113281, "num_tokens": 224344081.0, "step": 5878 }, { "epoch": 0.747869227833609, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.07548713684082, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8426503539085388, "num_tokens": 224384464.0, "step": 5879 }, { "epoch": 0.7479964381121995, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.33818817138672, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8643040060997009, "num_tokens": 224425542.0, "step": 5880 }, { "epoch": 0.7481236483907899, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.000059127807617, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8723205924034119, "num_tokens": 224465342.0, "step": 5881 }, { "epoch": 0.7482508586693805, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.78603172302246, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8592677116394043, "num_tokens": 224504787.0, "step": 5882 }, { "epoch": 0.748378068947971, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.445920944213867, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8571047782897949, "num_tokens": 224548534.0, "step": 5883 }, { "epoch": 0.7485052792265615, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.22237777709961, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8469433784484863, "num_tokens": 224590251.0, "step": 5884 }, { "epoch": 0.748632489505152, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.391498565673828, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8652012348175049, "num_tokens": 224633745.0, "step": 5885 }, { "epoch": 0.7487596997837426, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.384765625, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8387510776519775, "num_tokens": 224671319.0, "step": 5886 }, { "epoch": 0.748886910062333, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.46514320373535, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8549737334251404, "num_tokens": 224712670.0, "step": 5887 }, { "epoch": 0.7490141203409235, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.37398338317871, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8529112339019775, "num_tokens": 224747840.0, "step": 5888 }, { "epoch": 0.749141330619514, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.597986221313477, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8664421439170837, "num_tokens": 224783274.0, "step": 5889 }, { "epoch": 0.7492685408981046, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.083656311035156, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8526402711868286, "num_tokens": 224820456.0, "step": 5890 }, { "epoch": 0.7493957511766951, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.788532257080078, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8598220348358154, "num_tokens": 224858569.0, "step": 5891 }, { "epoch": 0.7495229614552856, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.168773651123047, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8539925813674927, "num_tokens": 224901033.0, "step": 5892 }, { "epoch": 0.749650171733876, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.70897102355957, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8698984980583191, "num_tokens": 224934345.0, "step": 5893 }, { "epoch": 0.7497773820124666, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.51829719543457, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8476179838180542, "num_tokens": 224972479.0, "step": 5894 }, { "epoch": 0.7499045922910571, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.171037673950195, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8720495700836182, "num_tokens": 225014213.0, "step": 5895 }, { "epoch": 0.7500318025696476, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.646183013916016, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8621121644973755, "num_tokens": 225049929.0, "step": 5896 }, { "epoch": 0.7501590128482382, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.062217712402344, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8416317701339722, "num_tokens": 225086409.0, "step": 5897 }, { "epoch": 0.7502862231268287, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.65174674987793, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8374925255775452, "num_tokens": 225124443.0, "step": 5898 }, { "epoch": 0.7504134334054191, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.172094345092773, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8448018431663513, "num_tokens": 225166665.0, "step": 5899 }, { "epoch": 0.7505406436840096, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.521984100341797, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8314052820205688, "num_tokens": 225203841.0, "step": 5900 }, { "epoch": 0.7506678539626002, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.13930320739746, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8385085463523865, "num_tokens": 225234844.0, "step": 5901 }, { "epoch": 0.7507950642411907, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.770444869995117, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8354543447494507, "num_tokens": 225269828.0, "step": 5902 }, { "epoch": 0.7509222745197812, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.110111236572266, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8611178994178772, "num_tokens": 225312361.0, "step": 5903 }, { "epoch": 0.7510494847983717, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.329875946044922, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8447428345680237, "num_tokens": 225353708.0, "step": 5904 }, { "epoch": 0.7511766950769622, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.155344009399414, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8742611408233643, "num_tokens": 225399730.0, "step": 5905 }, { "epoch": 0.7513039053555527, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.623775482177734, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8487766981124878, "num_tokens": 225441836.0, "step": 5906 }, { "epoch": 0.7514311156341432, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.32973289489746, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8635350465774536, "num_tokens": 225471004.0, "step": 5907 }, { "epoch": 0.7515583259127337, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.524580001831055, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8361599445343018, "num_tokens": 225505789.0, "step": 5908 }, { "epoch": 0.7516855361913243, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.164051055908203, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8582263588905334, "num_tokens": 225541138.0, "step": 5909 }, { "epoch": 0.7518127464699148, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.870647430419922, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8580378293991089, "num_tokens": 225581695.0, "step": 5910 }, { "epoch": 0.7519399567485053, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.098846435546875, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8602476716041565, "num_tokens": 225619532.0, "step": 5911 }, { "epoch": 0.7520671670270958, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.889440536499023, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8470555543899536, "num_tokens": 225657403.0, "step": 5912 }, { "epoch": 0.7521943773056863, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.434885025024414, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8512360453605652, "num_tokens": 225693402.0, "step": 5913 }, { "epoch": 0.7523215875842768, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.720775604248047, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8541908860206604, "num_tokens": 225727208.0, "step": 5914 }, { "epoch": 0.7524487978628673, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.286951065063477, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8602285385131836, "num_tokens": 225762117.0, "step": 5915 }, { "epoch": 0.7525760081414579, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.962553024291992, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8386662006378174, "num_tokens": 225794737.0, "step": 5916 }, { "epoch": 0.7527032184200484, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.379695892333984, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8543585538864136, "num_tokens": 225829074.0, "step": 5917 }, { "epoch": 0.7528304286986388, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.63650894165039, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8676613569259644, "num_tokens": 225871344.0, "step": 5918 }, { "epoch": 0.7529576389772293, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.507959365844727, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8502663373947144, "num_tokens": 225911321.0, "step": 5919 }, { "epoch": 0.7530848492558199, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.505657196044922, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8633388876914978, "num_tokens": 225944613.0, "step": 5920 }, { "epoch": 0.7532120595344104, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.594518661499023, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8604274988174438, "num_tokens": 225989501.0, "step": 5921 }, { "epoch": 0.7533392698130009, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.4376163482666, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8575261235237122, "num_tokens": 226024984.0, "step": 5922 }, { "epoch": 0.7534664800915915, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.468055725097656, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8683382272720337, "num_tokens": 226063255.0, "step": 5923 }, { "epoch": 0.7535936903701819, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.997047424316406, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8430511951446533, "num_tokens": 226105538.0, "step": 5924 }, { "epoch": 0.7537209006487724, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.478097915649414, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8586467504501343, "num_tokens": 226137449.0, "step": 5925 }, { "epoch": 0.7538481109273629, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.602895736694336, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8489699363708496, "num_tokens": 226171078.0, "step": 5926 }, { "epoch": 0.7539753212059535, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.36534881591797, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8570739030838013, "num_tokens": 226208060.0, "step": 5927 }, { "epoch": 0.754102531484544, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.70213508605957, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8655897378921509, "num_tokens": 226245330.0, "step": 5928 }, { "epoch": 0.7542297417631345, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.157243728637695, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.875755786895752, "num_tokens": 226281974.0, "step": 5929 }, { "epoch": 0.7543569520417249, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.813243865966797, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8442622423171997, "num_tokens": 226323302.0, "step": 5930 }, { "epoch": 0.7544841623203155, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.26540184020996, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8430082201957703, "num_tokens": 226361346.0, "step": 5931 }, { "epoch": 0.754611372598906, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.53119659423828, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8519601821899414, "num_tokens": 226397744.0, "step": 5932 }, { "epoch": 0.7547385828774965, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.635038375854492, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8624129295349121, "num_tokens": 226436745.0, "step": 5933 }, { "epoch": 0.754865793156087, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.187053680419922, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8633614778518677, "num_tokens": 226470302.0, "step": 5934 }, { "epoch": 0.7549930034346776, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.679107666015625, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8570561408996582, "num_tokens": 226500090.0, "step": 5935 }, { "epoch": 0.755120213713268, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.50298500061035, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8573217391967773, "num_tokens": 226533359.0, "step": 5936 }, { "epoch": 0.7552474239918585, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.467235565185547, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8459479808807373, "num_tokens": 226567060.0, "step": 5937 }, { "epoch": 0.755374634270449, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.49141502380371, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8530983328819275, "num_tokens": 226604270.0, "step": 5938 }, { "epoch": 0.7555018445490396, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.35423469543457, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8516520261764526, "num_tokens": 226641310.0, "step": 5939 }, { "epoch": 0.7556290548276301, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.483552932739258, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8436708450317383, "num_tokens": 226678305.0, "step": 5940 }, { "epoch": 0.7557562651062206, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.407352447509766, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8626976013183594, "num_tokens": 226715628.0, "step": 5941 }, { "epoch": 0.755883475384811, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.2799015045166, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8671875, "num_tokens": 226755346.0, "step": 5942 }, { "epoch": 0.7560106856634016, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.523021697998047, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.843185544013977, "num_tokens": 226792243.0, "step": 5943 }, { "epoch": 0.7561378959419921, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.66448402404785, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8707967400550842, "num_tokens": 226829033.0, "step": 5944 }, { "epoch": 0.7562651062205826, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.784395217895508, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8689489364624023, "num_tokens": 226870645.0, "step": 5945 }, { "epoch": 0.7563923164991732, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.52017593383789, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8593120574951172, "num_tokens": 226902395.0, "step": 5946 }, { "epoch": 0.7565195267777637, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.784828186035156, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8623408675193787, "num_tokens": 226942620.0, "step": 5947 }, { "epoch": 0.7566467370563541, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.305400848388672, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8635454773902893, "num_tokens": 226976472.0, "step": 5948 }, { "epoch": 0.7567739473349446, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.805343627929688, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8465657234191895, "num_tokens": 227013626.0, "step": 5949 }, { "epoch": 0.7569011576135352, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.765823364257812, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8423143029212952, "num_tokens": 227049336.0, "step": 5950 }, { "epoch": 0.7570283678921257, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.70257568359375, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8779278993606567, "num_tokens": 227087371.0, "step": 5951 }, { "epoch": 0.7571555781707162, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.846939086914062, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.857266366481781, "num_tokens": 227129188.0, "step": 5952 }, { "epoch": 0.7572827884493067, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.47084617614746, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.848355770111084, "num_tokens": 227161807.0, "step": 5953 }, { "epoch": 0.7574099987278972, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.78787612915039, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8452706336975098, "num_tokens": 227198431.0, "step": 5954 }, { "epoch": 0.7575372090064877, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.694503784179688, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8745344281196594, "num_tokens": 227236046.0, "step": 5955 }, { "epoch": 0.7576644192850782, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.772769927978516, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8719096183776855, "num_tokens": 227272925.0, "step": 5956 }, { "epoch": 0.7577916295636687, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.880186080932617, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8601483702659607, "num_tokens": 227312311.0, "step": 5957 }, { "epoch": 0.7579188398422593, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.53009033203125, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8754760026931763, "num_tokens": 227349632.0, "step": 5958 }, { "epoch": 0.7580460501208498, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 32.0767936706543, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8544455766677856, "num_tokens": 227392982.0, "step": 5959 }, { "epoch": 0.7581732603994403, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.27216148376465, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.852423906326294, "num_tokens": 227431878.0, "step": 5960 }, { "epoch": 0.7583004706780307, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.14458465576172, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8455268740653992, "num_tokens": 227470401.0, "step": 5961 }, { "epoch": 0.7584276809566213, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.055198669433594, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8611165285110474, "num_tokens": 227507189.0, "step": 5962 }, { "epoch": 0.7585548912352118, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 32.463531494140625, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8591047525405884, "num_tokens": 227548070.0, "step": 5963 }, { "epoch": 0.7586821015138023, "ewc_loss": 0.08251953125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.301057815551758, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8617210984230042, "num_tokens": 227584360.0, "step": 5964 }, { "epoch": 0.7588093117923929, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 32.494468688964844, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8614568114280701, "num_tokens": 227618961.0, "step": 5965 }, { "epoch": 0.7589365220709834, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.347614288330078, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8569859266281128, "num_tokens": 227658304.0, "step": 5966 }, { "epoch": 0.7590637323495738, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 32.27349853515625, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8542110919952393, "num_tokens": 227701526.0, "step": 5967 }, { "epoch": 0.7591909426281643, "ewc_loss": 0.08203125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 31.27955436706543, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8617998361587524, "num_tokens": 227744930.0, "step": 5968 }, { "epoch": 0.7593181529067549, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.640552520751953, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8670268654823303, "num_tokens": 227780607.0, "step": 5969 }, { "epoch": 0.7594453631853454, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.86629867553711, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8528102040290833, "num_tokens": 227825742.0, "step": 5970 }, { "epoch": 0.7595725734639359, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.49687385559082, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8365539312362671, "num_tokens": 227865278.0, "step": 5971 }, { "epoch": 0.7596997837425264, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.983234405517578, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8530066013336182, "num_tokens": 227906912.0, "step": 5972 }, { "epoch": 0.7598269940211169, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.420137405395508, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8534010648727417, "num_tokens": 227947067.0, "step": 5973 }, { "epoch": 0.7599542042997074, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.709671020507812, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8389014005661011, "num_tokens": 227988563.0, "step": 5974 }, { "epoch": 0.7600814145782979, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.587596893310547, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8493331670761108, "num_tokens": 228028556.0, "step": 5975 }, { "epoch": 0.7602086248568884, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.701915740966797, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.86954265832901, "num_tokens": 228066265.0, "step": 5976 }, { "epoch": 0.760335835135479, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.623294830322266, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8532918691635132, "num_tokens": 228103732.0, "step": 5977 }, { "epoch": 0.7604630454140695, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.609779357910156, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8538631200790405, "num_tokens": 228149609.0, "step": 5978 }, { "epoch": 0.7605902556926599, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.568828582763672, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8592634797096252, "num_tokens": 228188462.0, "step": 5979 }, { "epoch": 0.7607174659712505, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.79462432861328, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8695470690727234, "num_tokens": 228229252.0, "step": 5980 }, { "epoch": 0.760844676249841, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.540285110473633, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8523075580596924, "num_tokens": 228267397.0, "step": 5981 }, { "epoch": 0.7609718865284315, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.509258270263672, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8635967373847961, "num_tokens": 228307561.0, "step": 5982 }, { "epoch": 0.761099096807022, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.487621307373047, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8580410480499268, "num_tokens": 228347667.0, "step": 5983 }, { "epoch": 0.7612263070856126, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.24553680419922, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8536111116409302, "num_tokens": 228386820.0, "step": 5984 }, { "epoch": 0.761353517364203, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.949888229370117, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8651361465454102, "num_tokens": 228427304.0, "step": 5985 }, { "epoch": 0.7614807276427935, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.33856964111328, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8460142016410828, "num_tokens": 228470779.0, "step": 5986 }, { "epoch": 0.761607937921384, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.850948333740234, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8723964691162109, "num_tokens": 228500454.0, "step": 5987 }, { "epoch": 0.7617351481999746, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.249608993530273, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8633604049682617, "num_tokens": 228537374.0, "step": 5988 }, { "epoch": 0.7618623584785651, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.89052963256836, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8580683469772339, "num_tokens": 228577215.0, "step": 5989 }, { "epoch": 0.7619895687571556, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.46883201599121, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8615710735321045, "num_tokens": 228611693.0, "step": 5990 }, { "epoch": 0.762116779035746, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.621604919433594, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8546913862228394, "num_tokens": 228644231.0, "step": 5991 }, { "epoch": 0.7622439893143366, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.568580627441406, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8675038814544678, "num_tokens": 228684776.0, "step": 5992 }, { "epoch": 0.7623711995929271, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.352771759033203, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8629844784736633, "num_tokens": 228724214.0, "step": 5993 }, { "epoch": 0.7624984098715176, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.780271530151367, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8563079833984375, "num_tokens": 228765098.0, "step": 5994 }, { "epoch": 0.7626256201501082, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.527063369750977, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8581459522247314, "num_tokens": 228802423.0, "step": 5995 }, { "epoch": 0.7627528304286987, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.536529541015625, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8461417555809021, "num_tokens": 228840218.0, "step": 5996 }, { "epoch": 0.7628800407072891, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.51768684387207, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8626832962036133, "num_tokens": 228882810.0, "step": 5997 }, { "epoch": 0.7630072509858796, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.218936920166016, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8632497787475586, "num_tokens": 228917664.0, "step": 5998 }, { "epoch": 0.7631344612644702, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.96147918701172, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8634982109069824, "num_tokens": 228951017.0, "step": 5999 }, { "epoch": 0.7632616715430607, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.18317985534668, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8583293557167053, "num_tokens": 228990949.0, "step": 6000 }, { "epoch": 0.7633888818216512, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.828691482543945, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8620446920394897, "num_tokens": 229026604.0, "step": 6001 }, { "epoch": 0.7635160921002417, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.27485466003418, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8510977029800415, "num_tokens": 229061810.0, "step": 6002 }, { "epoch": 0.7636433023788322, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.07805252075195, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8336465954780579, "num_tokens": 229107765.0, "step": 6003 }, { "epoch": 0.7637705126574227, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.359922409057617, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.844184935092926, "num_tokens": 229147822.0, "step": 6004 }, { "epoch": 0.7638977229360132, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.936038970947266, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.866280734539032, "num_tokens": 229184421.0, "step": 6005 }, { "epoch": 0.7640249332146037, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.126644134521484, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8551859259605408, "num_tokens": 229221737.0, "step": 6006 }, { "epoch": 0.7641521434931943, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.31310272216797, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8514981269836426, "num_tokens": 229254727.0, "step": 6007 }, { "epoch": 0.7642793537717848, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.374948501586914, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8667762279510498, "num_tokens": 229292648.0, "step": 6008 }, { "epoch": 0.7644065640503753, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.142547607421875, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8611557483673096, "num_tokens": 229335708.0, "step": 6009 }, { "epoch": 0.7645337743289657, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.392471313476562, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8639134168624878, "num_tokens": 229373791.0, "step": 6010 }, { "epoch": 0.7646609846075563, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.28889465332031, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8653616905212402, "num_tokens": 229409891.0, "step": 6011 }, { "epoch": 0.7647881948861468, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.791399002075195, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8587322235107422, "num_tokens": 229449859.0, "step": 6012 }, { "epoch": 0.7649154051647373, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.604061126708984, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8635590076446533, "num_tokens": 229484224.0, "step": 6013 }, { "epoch": 0.7650426154433279, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.87740707397461, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8521156311035156, "num_tokens": 229521104.0, "step": 6014 }, { "epoch": 0.7651698257219184, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.778385162353516, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8399943113327026, "num_tokens": 229558918.0, "step": 6015 }, { "epoch": 0.7652970360005088, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.590253829956055, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8434150218963623, "num_tokens": 229596931.0, "step": 6016 }, { "epoch": 0.7654242462790993, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.92685890197754, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8423688411712646, "num_tokens": 229633903.0, "step": 6017 }, { "epoch": 0.7655514565576899, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.690881729125977, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8547180891036987, "num_tokens": 229674224.0, "step": 6018 }, { "epoch": 0.7656786668362804, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.39088249206543, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8550019264221191, "num_tokens": 229710547.0, "step": 6019 }, { "epoch": 0.7658058771148709, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.930753707885742, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8725232481956482, "num_tokens": 229752020.0, "step": 6020 }, { "epoch": 0.7659330873934614, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.524669647216797, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.854799211025238, "num_tokens": 229785710.0, "step": 6021 }, { "epoch": 0.7660602976720519, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.826175689697266, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8640454411506653, "num_tokens": 229824118.0, "step": 6022 }, { "epoch": 0.7661875079506424, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.284969329833984, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8682029247283936, "num_tokens": 229854231.0, "step": 6023 }, { "epoch": 0.7663147182292329, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.13099670410156, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8473859429359436, "num_tokens": 229888102.0, "step": 6024 }, { "epoch": 0.7664419285078234, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.393251419067383, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8597066402435303, "num_tokens": 229927634.0, "step": 6025 }, { "epoch": 0.766569138786414, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.678836822509766, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8640371561050415, "num_tokens": 229966526.0, "step": 6026 }, { "epoch": 0.7666963490650045, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.412508010864258, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8667358756065369, "num_tokens": 230002761.0, "step": 6027 }, { "epoch": 0.7668235593435949, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.777978897094727, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8629165887832642, "num_tokens": 230041303.0, "step": 6028 }, { "epoch": 0.7669507696221854, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.3150691986084, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8616752624511719, "num_tokens": 230081117.0, "step": 6029 }, { "epoch": 0.767077979900776, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.504453659057617, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.85101318359375, "num_tokens": 230119411.0, "step": 6030 }, { "epoch": 0.7672051901793665, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.480728149414062, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8651933670043945, "num_tokens": 230161111.0, "step": 6031 }, { "epoch": 0.767332400457957, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.027442932128906, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8523123264312744, "num_tokens": 230196999.0, "step": 6032 }, { "epoch": 0.7674596107365476, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.276456832885742, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8589763045310974, "num_tokens": 230231495.0, "step": 6033 }, { "epoch": 0.767586821015138, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.85310935974121, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8504505753517151, "num_tokens": 230267732.0, "step": 6034 }, { "epoch": 0.7677140312937285, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.665374755859375, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8564576506614685, "num_tokens": 230297559.0, "step": 6035 }, { "epoch": 0.767841241572319, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.695083618164062, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8451922535896301, "num_tokens": 230336396.0, "step": 6036 }, { "epoch": 0.7679684518509096, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.58357810974121, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8570072650909424, "num_tokens": 230368152.0, "step": 6037 }, { "epoch": 0.7680956621295001, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.755088806152344, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8631673455238342, "num_tokens": 230408081.0, "step": 6038 }, { "epoch": 0.7682228724080906, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.544963836669922, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8503193855285645, "num_tokens": 230441691.0, "step": 6039 }, { "epoch": 0.768350082686681, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.497989654541016, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8540546894073486, "num_tokens": 230480089.0, "step": 6040 }, { "epoch": 0.7684772929652716, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.8951416015625, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8508149981498718, "num_tokens": 230518655.0, "step": 6041 }, { "epoch": 0.7686045032438621, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.347946166992188, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8598541617393494, "num_tokens": 230555717.0, "step": 6042 }, { "epoch": 0.7687317135224526, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.40201950073242, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8657592535018921, "num_tokens": 230596699.0, "step": 6043 }, { "epoch": 0.7688589238010431, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.26814079284668, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.865185022354126, "num_tokens": 230633284.0, "step": 6044 }, { "epoch": 0.7689861340796337, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.8637638092041, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8599848747253418, "num_tokens": 230673133.0, "step": 6045 }, { "epoch": 0.7691133443582241, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.563922882080078, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8581438064575195, "num_tokens": 230719003.0, "step": 6046 }, { "epoch": 0.7692405546368146, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.676361083984375, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8662564158439636, "num_tokens": 230754805.0, "step": 6047 }, { "epoch": 0.7693677649154052, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.58743667602539, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8446663618087769, "num_tokens": 230793249.0, "step": 6048 }, { "epoch": 0.7694949751939957, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.29227066040039, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8734599947929382, "num_tokens": 230826025.0, "step": 6049 }, { "epoch": 0.7696221854725862, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.65165901184082, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8698316216468811, "num_tokens": 230866615.0, "step": 6050 }, { "epoch": 0.7697493957511767, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.450542449951172, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8416704535484314, "num_tokens": 230901587.0, "step": 6051 }, { "epoch": 0.7698766060297672, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.849584579467773, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8631348609924316, "num_tokens": 230944267.0, "step": 6052 }, { "epoch": 0.7700038163083577, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.147506713867188, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8487805724143982, "num_tokens": 230980022.0, "step": 6053 }, { "epoch": 0.7701310265869482, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.96568489074707, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8637994527816772, "num_tokens": 231018110.0, "step": 6054 }, { "epoch": 0.7702582368655387, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.39836311340332, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8619478940963745, "num_tokens": 231056531.0, "step": 6055 }, { "epoch": 0.7703854471441293, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.870403289794922, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8635355830192566, "num_tokens": 231096270.0, "step": 6056 }, { "epoch": 0.7705126574227198, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.399490356445312, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8505800366401672, "num_tokens": 231136300.0, "step": 6057 }, { "epoch": 0.7706398677013102, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.115699768066406, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8438732028007507, "num_tokens": 231169005.0, "step": 6058 }, { "epoch": 0.7707670779799007, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.292423248291016, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8644661903381348, "num_tokens": 231205274.0, "step": 6059 }, { "epoch": 0.7708942882584913, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.040870666503906, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8623075485229492, "num_tokens": 231239022.0, "step": 6060 }, { "epoch": 0.7710214985370818, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.5932559967041, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8388538360595703, "num_tokens": 231282952.0, "step": 6061 }, { "epoch": 0.7711487088156723, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.704042434692383, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8542454242706299, "num_tokens": 231322980.0, "step": 6062 }, { "epoch": 0.7712759190942629, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.614479064941406, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8495647311210632, "num_tokens": 231358382.0, "step": 6063 }, { "epoch": 0.7714031293728534, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.551471710205078, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8654961585998535, "num_tokens": 231401915.0, "step": 6064 }, { "epoch": 0.7715303396514438, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.510833740234375, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8344035148620605, "num_tokens": 231433241.0, "step": 6065 }, { "epoch": 0.7716575499300343, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.8299617767334, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8522292971611023, "num_tokens": 231478318.0, "step": 6066 }, { "epoch": 0.7717847602086249, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.634931564331055, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8499479293823242, "num_tokens": 231518584.0, "step": 6067 }, { "epoch": 0.7719119704872154, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.58782386779785, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.856793999671936, "num_tokens": 231556689.0, "step": 6068 }, { "epoch": 0.7720391807658059, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.812267303466797, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8553533554077148, "num_tokens": 231593936.0, "step": 6069 }, { "epoch": 0.7721663910443964, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.401771545410156, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8760608434677124, "num_tokens": 231634785.0, "step": 6070 }, { "epoch": 0.7722936013229869, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.726518630981445, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8515044450759888, "num_tokens": 231675798.0, "step": 6071 }, { "epoch": 0.7724208116015774, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.701162338256836, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8541057109832764, "num_tokens": 231711611.0, "step": 6072 }, { "epoch": 0.7725480218801679, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.58070945739746, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8468067049980164, "num_tokens": 231746997.0, "step": 6073 }, { "epoch": 0.7726752321587584, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.68553924560547, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8370586633682251, "num_tokens": 231787696.0, "step": 6074 }, { "epoch": 0.772802442437349, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.638338088989258, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8477888107299805, "num_tokens": 231826127.0, "step": 6075 }, { "epoch": 0.7729296527159395, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.533802032470703, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8505706191062927, "num_tokens": 231857538.0, "step": 6076 }, { "epoch": 0.7730568629945299, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.50404167175293, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8544192910194397, "num_tokens": 231893913.0, "step": 6077 }, { "epoch": 0.7731840732731204, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.530855178833008, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8592700958251953, "num_tokens": 231934274.0, "step": 6078 }, { "epoch": 0.773311283551711, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.8033390045166, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8506420850753784, "num_tokens": 231983197.0, "step": 6079 }, { "epoch": 0.7734384938303015, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.617198944091797, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8521712422370911, "num_tokens": 232021265.0, "step": 6080 }, { "epoch": 0.773565704108892, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.69582748413086, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8544586896896362, "num_tokens": 232060992.0, "step": 6081 }, { "epoch": 0.7736929143874826, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.9083194732666, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8535698652267456, "num_tokens": 232096758.0, "step": 6082 }, { "epoch": 0.773820124666073, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.902196884155273, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8528959155082703, "num_tokens": 232133773.0, "step": 6083 }, { "epoch": 0.7739473349446635, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.948989868164062, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.85426265001297, "num_tokens": 232173182.0, "step": 6084 }, { "epoch": 0.774074545223254, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.614349365234375, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8642651438713074, "num_tokens": 232206478.0, "step": 6085 }, { "epoch": 0.7742017555018446, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.19260025024414, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8505854606628418, "num_tokens": 232242231.0, "step": 6086 }, { "epoch": 0.7743289657804351, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.366859436035156, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8586361408233643, "num_tokens": 232270636.0, "step": 6087 }, { "epoch": 0.7744561760590256, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.18331527709961, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8630200028419495, "num_tokens": 232309974.0, "step": 6088 }, { "epoch": 0.774583386337616, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.75005340576172, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8646408319473267, "num_tokens": 232344118.0, "step": 6089 }, { "epoch": 0.7747105966162066, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.06509017944336, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8454522490501404, "num_tokens": 232385183.0, "step": 6090 }, { "epoch": 0.7748378068947971, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.45238494873047, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8582512140274048, "num_tokens": 232421670.0, "step": 6091 }, { "epoch": 0.7749650171733876, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.2509765625, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.843620777130127, "num_tokens": 232459563.0, "step": 6092 }, { "epoch": 0.7750922274519781, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.675872802734375, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8567982912063599, "num_tokens": 232496357.0, "step": 6093 }, { "epoch": 0.7752194377305687, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.948780059814453, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.856127142906189, "num_tokens": 232530436.0, "step": 6094 }, { "epoch": 0.7753466480091591, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.778188705444336, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8499734997749329, "num_tokens": 232568585.0, "step": 6095 }, { "epoch": 0.7754738582877496, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.98252296447754, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8629401922225952, "num_tokens": 232602506.0, "step": 6096 }, { "epoch": 0.7756010685663401, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.529691696166992, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8634670376777649, "num_tokens": 232642819.0, "step": 6097 }, { "epoch": 0.7757282788449307, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.061405181884766, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8449543714523315, "num_tokens": 232679505.0, "step": 6098 }, { "epoch": 0.7758554891235212, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.58062171936035, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8612970113754272, "num_tokens": 232715701.0, "step": 6099 }, { "epoch": 0.7759826994021117, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.05868911743164, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8573170304298401, "num_tokens": 232750786.0, "step": 6100 }, { "epoch": 0.7761099096807021, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.696950912475586, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8452361822128296, "num_tokens": 232791370.0, "step": 6101 }, { "epoch": 0.7762371199592927, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.4322624206543, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8575284481048584, "num_tokens": 232830895.0, "step": 6102 }, { "epoch": 0.7763643302378832, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.680999755859375, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8558753132820129, "num_tokens": 232869940.0, "step": 6103 }, { "epoch": 0.7764915405164737, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.94851303100586, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8524067401885986, "num_tokens": 232910743.0, "step": 6104 }, { "epoch": 0.7766187507950643, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.38726806640625, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8565463423728943, "num_tokens": 232948428.0, "step": 6105 }, { "epoch": 0.7767459610736548, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.49889373779297, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8616040945053101, "num_tokens": 232985142.0, "step": 6106 }, { "epoch": 0.7768731713522452, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.57776641845703, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8650990724563599, "num_tokens": 233020810.0, "step": 6107 }, { "epoch": 0.7770003816308357, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.6289119720459, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.848388671875, "num_tokens": 233055674.0, "step": 6108 }, { "epoch": 0.7771275919094263, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.9783992767334, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.851378321647644, "num_tokens": 233084506.0, "step": 6109 }, { "epoch": 0.7772548021880168, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 32.10002899169922, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.872779369354248, "num_tokens": 233120172.0, "step": 6110 }, { "epoch": 0.7773820124666073, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.149105072021484, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8463103771209717, "num_tokens": 233160075.0, "step": 6111 }, { "epoch": 0.7775092227451978, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.45624351501465, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8494457602500916, "num_tokens": 233201714.0, "step": 6112 }, { "epoch": 0.7776364330237884, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.06648635864258, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8626111745834351, "num_tokens": 233237642.0, "step": 6113 }, { "epoch": 0.7777636433023788, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.785873413085938, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8617959022521973, "num_tokens": 233279712.0, "step": 6114 }, { "epoch": 0.7778908535809693, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.19216537475586, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.862136960029602, "num_tokens": 233322441.0, "step": 6115 }, { "epoch": 0.7780180638595598, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.583606719970703, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8508864641189575, "num_tokens": 233371053.0, "step": 6116 }, { "epoch": 0.7781452741381504, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.097572326660156, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8580818176269531, "num_tokens": 233405372.0, "step": 6117 }, { "epoch": 0.7782724844167409, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.889944076538086, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8631421327590942, "num_tokens": 233447209.0, "step": 6118 }, { "epoch": 0.7783996946953314, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.814008712768555, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8598015904426575, "num_tokens": 233484621.0, "step": 6119 }, { "epoch": 0.7785269049739219, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 32.24469757080078, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8390352725982666, "num_tokens": 233520756.0, "step": 6120 }, { "epoch": 0.7786541152525124, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.462812423706055, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8485161066055298, "num_tokens": 233559120.0, "step": 6121 }, { "epoch": 0.7787813255311029, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.538330078125, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8674106001853943, "num_tokens": 233590513.0, "step": 6122 }, { "epoch": 0.7789085358096934, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.416627883911133, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8737533092498779, "num_tokens": 233625986.0, "step": 6123 }, { "epoch": 0.779035746088284, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.14539337158203, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8651410341262817, "num_tokens": 233655097.0, "step": 6124 }, { "epoch": 0.7791629563668745, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.600854873657227, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8472082018852234, "num_tokens": 233687815.0, "step": 6125 }, { "epoch": 0.7792901666454649, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.006954193115234, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8705098628997803, "num_tokens": 233722611.0, "step": 6126 }, { "epoch": 0.7794173769240554, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.711204528808594, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8437932729721069, "num_tokens": 233765345.0, "step": 6127 }, { "epoch": 0.779544587202646, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.126277923583984, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8589281439781189, "num_tokens": 233804358.0, "step": 6128 }, { "epoch": 0.7796717974812365, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 32.11672592163086, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8466984629631042, "num_tokens": 233845117.0, "step": 6129 }, { "epoch": 0.779799007759827, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.892240524291992, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8486207723617554, "num_tokens": 233884091.0, "step": 6130 }, { "epoch": 0.7799262180384176, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.542449951171875, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.854117751121521, "num_tokens": 233922590.0, "step": 6131 }, { "epoch": 0.780053428317008, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.511302947998047, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8588074445724487, "num_tokens": 233954138.0, "step": 6132 }, { "epoch": 0.7801806385955985, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.3087272644043, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8659058809280396, "num_tokens": 233988035.0, "step": 6133 }, { "epoch": 0.780307848874189, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.49986457824707, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8465993404388428, "num_tokens": 234023747.0, "step": 6134 }, { "epoch": 0.7804350591527796, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.14820098876953, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8651899695396423, "num_tokens": 234064182.0, "step": 6135 }, { "epoch": 0.7805622694313701, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.755998611450195, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.840407133102417, "num_tokens": 234106533.0, "step": 6136 }, { "epoch": 0.7806894797099606, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.999874114990234, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8693116903305054, "num_tokens": 234145492.0, "step": 6137 }, { "epoch": 0.780816689988551, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.830751419067383, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8595312237739563, "num_tokens": 234181709.0, "step": 6138 }, { "epoch": 0.7809439002671416, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.944786071777344, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8560692071914673, "num_tokens": 234217854.0, "step": 6139 }, { "epoch": 0.7810711105457321, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.833372116088867, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8566999435424805, "num_tokens": 234255769.0, "step": 6140 }, { "epoch": 0.7811983208243226, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.66083335876465, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8425729274749756, "num_tokens": 234290828.0, "step": 6141 }, { "epoch": 0.7813255311029131, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.89948272705078, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8559263348579407, "num_tokens": 234326336.0, "step": 6142 }, { "epoch": 0.7814527413815037, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.479612350463867, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8513547778129578, "num_tokens": 234366210.0, "step": 6143 }, { "epoch": 0.7815799516600941, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.849220275878906, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8481511473655701, "num_tokens": 234398975.0, "step": 6144 }, { "epoch": 0.7817071619386846, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.75728416442871, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8511724472045898, "num_tokens": 234437746.0, "step": 6145 }, { "epoch": 0.7818343722172751, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.485321044921875, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8611081838607788, "num_tokens": 234478402.0, "step": 6146 }, { "epoch": 0.7819615824958657, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.747617721557617, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8413240313529968, "num_tokens": 234509699.0, "step": 6147 }, { "epoch": 0.7820887927744562, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.74668312072754, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8510304689407349, "num_tokens": 234551515.0, "step": 6148 }, { "epoch": 0.7822160030530467, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.511869430541992, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.863347589969635, "num_tokens": 234582811.0, "step": 6149 }, { "epoch": 0.7823432133316371, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.85330581665039, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8496224284172058, "num_tokens": 234625299.0, "step": 6150 }, { "epoch": 0.7824704236102277, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.744531631469727, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8627132177352905, "num_tokens": 234665641.0, "step": 6151 }, { "epoch": 0.7825976338888182, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.96268653869629, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8558538556098938, "num_tokens": 234704668.0, "step": 6152 }, { "epoch": 0.7827248441674087, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.835908889770508, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8595809936523438, "num_tokens": 234738080.0, "step": 6153 }, { "epoch": 0.7828520544459993, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.935392379760742, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8531467914581299, "num_tokens": 234781077.0, "step": 6154 }, { "epoch": 0.7829792647245898, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.16851806640625, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8408341407775879, "num_tokens": 234809374.0, "step": 6155 }, { "epoch": 0.7831064750031802, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.991443634033203, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8436933159828186, "num_tokens": 234845585.0, "step": 6156 }, { "epoch": 0.7832336852817707, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.245052337646484, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8560588359832764, "num_tokens": 234883891.0, "step": 6157 }, { "epoch": 0.7833608955603613, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 32.06182861328125, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.85528564453125, "num_tokens": 234919445.0, "step": 6158 }, { "epoch": 0.7834881058389518, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.297908782958984, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8586118221282959, "num_tokens": 234951465.0, "step": 6159 }, { "epoch": 0.7836153161175423, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.66793441772461, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8473306894302368, "num_tokens": 234989970.0, "step": 6160 }, { "epoch": 0.7837425263961328, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.18971252441406, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8563744425773621, "num_tokens": 235029673.0, "step": 6161 }, { "epoch": 0.7838697366747234, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.28635025024414, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8556495904922485, "num_tokens": 235065389.0, "step": 6162 }, { "epoch": 0.7839969469533138, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.69093894958496, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8663904070854187, "num_tokens": 235099795.0, "step": 6163 }, { "epoch": 0.7841241572319043, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.107078552246094, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8565084934234619, "num_tokens": 235137475.0, "step": 6164 }, { "epoch": 0.7842513675104948, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.733867645263672, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8522114157676697, "num_tokens": 235178454.0, "step": 6165 }, { "epoch": 0.7843785777890854, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.700868606567383, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8741581439971924, "num_tokens": 235216139.0, "step": 6166 }, { "epoch": 0.7845057880676759, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.552701950073242, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8700991868972778, "num_tokens": 235253571.0, "step": 6167 }, { "epoch": 0.7846329983462664, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.818506240844727, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8406769037246704, "num_tokens": 235293950.0, "step": 6168 }, { "epoch": 0.7847602086248568, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.75714683532715, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8602256774902344, "num_tokens": 235336419.0, "step": 6169 }, { "epoch": 0.7848874189034474, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.064918518066406, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8619124293327332, "num_tokens": 235376039.0, "step": 6170 }, { "epoch": 0.7850146291820379, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.714462280273438, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8440725207328796, "num_tokens": 235412736.0, "step": 6171 }, { "epoch": 0.7851418394606284, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.610448837280273, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.857099711894989, "num_tokens": 235457318.0, "step": 6172 }, { "epoch": 0.785269049739219, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.15861511230469, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8460647463798523, "num_tokens": 235491471.0, "step": 6173 }, { "epoch": 0.7853962600178095, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.53849983215332, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8671907782554626, "num_tokens": 235531324.0, "step": 6174 }, { "epoch": 0.7855234702963999, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.978107452392578, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8431580066680908, "num_tokens": 235573233.0, "step": 6175 }, { "epoch": 0.7856506805749904, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.367420196533203, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8584692478179932, "num_tokens": 235614400.0, "step": 6176 }, { "epoch": 0.785777890853581, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.96633529663086, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8640109300613403, "num_tokens": 235652247.0, "step": 6177 }, { "epoch": 0.7859051011321715, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.50969123840332, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8517152070999146, "num_tokens": 235681810.0, "step": 6178 }, { "epoch": 0.786032311410762, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 33.099857330322266, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8652225732803345, "num_tokens": 235718422.0, "step": 6179 }, { "epoch": 0.7861595216893525, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.363752365112305, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.851447582244873, "num_tokens": 235758886.0, "step": 6180 }, { "epoch": 0.786286731967943, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.20164489746094, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8598352670669556, "num_tokens": 235792270.0, "step": 6181 }, { "epoch": 0.7864139422465335, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.708446502685547, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8465381264686584, "num_tokens": 235826678.0, "step": 6182 }, { "epoch": 0.786541152525124, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.936174392700195, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8682211637496948, "num_tokens": 235860437.0, "step": 6183 }, { "epoch": 0.7866683628037145, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.93307113647461, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8598735928535461, "num_tokens": 235900865.0, "step": 6184 }, { "epoch": 0.7867955730823051, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.16619873046875, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8420872688293457, "num_tokens": 235937419.0, "step": 6185 }, { "epoch": 0.7869227833608956, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.34033203125, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8473144769668579, "num_tokens": 235976425.0, "step": 6186 }, { "epoch": 0.787049993639486, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.906570434570312, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8501776456832886, "num_tokens": 236012968.0, "step": 6187 }, { "epoch": 0.7871772039180766, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.82306671142578, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8579535484313965, "num_tokens": 236052462.0, "step": 6188 }, { "epoch": 0.7873044141966671, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.181819915771484, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8721742630004883, "num_tokens": 236087604.0, "step": 6189 }, { "epoch": 0.7874316244752576, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.05198287963867, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8552283644676208, "num_tokens": 236126344.0, "step": 6190 }, { "epoch": 0.7875588347538481, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.22898864746094, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8642545938491821, "num_tokens": 236161913.0, "step": 6191 }, { "epoch": 0.7876860450324387, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.856761932373047, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8583935499191284, "num_tokens": 236200612.0, "step": 6192 }, { "epoch": 0.7878132553110291, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.05368423461914, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8622298836708069, "num_tokens": 236238136.0, "step": 6193 }, { "epoch": 0.7879404655896196, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.922191619873047, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8532862067222595, "num_tokens": 236281303.0, "step": 6194 }, { "epoch": 0.7880676758682101, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.5350341796875, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8539935350418091, "num_tokens": 236322355.0, "step": 6195 }, { "epoch": 0.7881948861468007, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.82476234436035, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8544679284095764, "num_tokens": 236354263.0, "step": 6196 }, { "epoch": 0.7883220964253912, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 32.44243621826172, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8573187589645386, "num_tokens": 236391581.0, "step": 6197 }, { "epoch": 0.7884493067039817, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.838348388671875, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8648154735565186, "num_tokens": 236426003.0, "step": 6198 }, { "epoch": 0.7885765169825721, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.26094436645508, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8652791976928711, "num_tokens": 236464874.0, "step": 6199 }, { "epoch": 0.7887037272611627, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.749164581298828, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.866938054561615, "num_tokens": 236499826.0, "step": 6200 }, { "epoch": 0.7888309375397532, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 32.51205062866211, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8680295944213867, "num_tokens": 236538615.0, "step": 6201 }, { "epoch": 0.7889581478183437, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 32.375396728515625, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8554979562759399, "num_tokens": 236577860.0, "step": 6202 }, { "epoch": 0.7890853580969343, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 32.12488555908203, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.871247410774231, "num_tokens": 236619041.0, "step": 6203 }, { "epoch": 0.7892125683755248, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.782073974609375, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8599522113800049, "num_tokens": 236655788.0, "step": 6204 }, { "epoch": 0.7893397786541152, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.575111389160156, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8695850372314453, "num_tokens": 236691532.0, "step": 6205 }, { "epoch": 0.7894669889327057, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.77857208251953, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.864315390586853, "num_tokens": 236733792.0, "step": 6206 }, { "epoch": 0.7895941992112963, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.074058532714844, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8687131404876709, "num_tokens": 236775454.0, "step": 6207 }, { "epoch": 0.7897214094898868, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.837602615356445, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8661646842956543, "num_tokens": 236811572.0, "step": 6208 }, { "epoch": 0.7898486197684773, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.977724075317383, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8425264358520508, "num_tokens": 236851018.0, "step": 6209 }, { "epoch": 0.7899758300470678, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.831628799438477, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8610901832580566, "num_tokens": 236891871.0, "step": 6210 }, { "epoch": 0.7901030403256584, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.14745330810547, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8592745065689087, "num_tokens": 236926771.0, "step": 6211 }, { "epoch": 0.7902302506042488, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.943880081176758, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8442963361740112, "num_tokens": 236964768.0, "step": 6212 }, { "epoch": 0.7903574608828393, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.92856216430664, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8451268076896667, "num_tokens": 237005075.0, "step": 6213 }, { "epoch": 0.7904846711614298, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.13576126098633, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8582945466041565, "num_tokens": 237035517.0, "step": 6214 }, { "epoch": 0.7906118814400204, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.639102935791016, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8742155432701111, "num_tokens": 237076319.0, "step": 6215 }, { "epoch": 0.7907390917186109, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 32.37702941894531, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8478934168815613, "num_tokens": 237117935.0, "step": 6216 }, { "epoch": 0.7908663019972014, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.720935821533203, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8660599589347839, "num_tokens": 237155789.0, "step": 6217 }, { "epoch": 0.7909935122757918, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.132057189941406, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8625434041023254, "num_tokens": 237198870.0, "step": 6218 }, { "epoch": 0.7911207225543824, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.944013595581055, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8618764877319336, "num_tokens": 237243084.0, "step": 6219 }, { "epoch": 0.7912479328329729, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.95819854736328, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8483908176422119, "num_tokens": 237277245.0, "step": 6220 }, { "epoch": 0.7913751431115634, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.866701126098633, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8612120747566223, "num_tokens": 237317841.0, "step": 6221 }, { "epoch": 0.791502353390154, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.832183837890625, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8462063074111938, "num_tokens": 237349327.0, "step": 6222 }, { "epoch": 0.7916295636687445, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.913000106811523, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8428956270217896, "num_tokens": 237396179.0, "step": 6223 }, { "epoch": 0.7917567739473349, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.761255264282227, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8537405729293823, "num_tokens": 237437766.0, "step": 6224 }, { "epoch": 0.7918839842259254, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.794832229614258, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8614038228988647, "num_tokens": 237479239.0, "step": 6225 }, { "epoch": 0.792011194504516, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.906898498535156, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8702514171600342, "num_tokens": 237514641.0, "step": 6226 }, { "epoch": 0.7921384047831065, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.8597412109375, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.868550717830658, "num_tokens": 237555319.0, "step": 6227 }, { "epoch": 0.792265615061697, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.86923599243164, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8595032691955566, "num_tokens": 237597786.0, "step": 6228 }, { "epoch": 0.7923928253402875, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.82396125793457, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8699881434440613, "num_tokens": 237636584.0, "step": 6229 }, { "epoch": 0.792520035618878, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.968000411987305, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8482894897460938, "num_tokens": 237676000.0, "step": 6230 }, { "epoch": 0.7926472458974685, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.029884338378906, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8830969333648682, "num_tokens": 237713003.0, "step": 6231 }, { "epoch": 0.792774456176059, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.798500061035156, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8602449893951416, "num_tokens": 237742054.0, "step": 6232 }, { "epoch": 0.7929016664546495, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.9033145904541, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8485557436943054, "num_tokens": 237783147.0, "step": 6233 }, { "epoch": 0.7930288767332401, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.93036460876465, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8503396511077881, "num_tokens": 237814322.0, "step": 6234 }, { "epoch": 0.7931560870118306, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5854835510253906e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.744964599609375, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8622089624404907, "num_tokens": 237849278.0, "step": 6235 }, { "epoch": 0.793283297290421, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.036190032958984, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8611364960670471, "num_tokens": 237893434.0, "step": 6236 }, { "epoch": 0.7934105075690115, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.738525390625, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8648786544799805, "num_tokens": 237932135.0, "step": 6237 }, { "epoch": 0.7935377178476021, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.2733039855957, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8523195385932922, "num_tokens": 237966320.0, "step": 6238 }, { "epoch": 0.7936649281261926, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.83248519897461, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8625880479812622, "num_tokens": 238006365.0, "step": 6239 }, { "epoch": 0.7937921384047831, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.20626449584961, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8531209230422974, "num_tokens": 238039307.0, "step": 6240 }, { "epoch": 0.7939193486833737, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.841251373291016, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8437997102737427, "num_tokens": 238078015.0, "step": 6241 }, { "epoch": 0.7940465589619641, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.17977523803711, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.854110836982727, "num_tokens": 238116790.0, "step": 6242 }, { "epoch": 0.7941737692405546, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.02096939086914, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8643798828125, "num_tokens": 238155330.0, "step": 6243 }, { "epoch": 0.7943009795191451, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 32.21330261230469, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8418499231338501, "num_tokens": 238194492.0, "step": 6244 }, { "epoch": 0.7944281897977357, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.876365661621094, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8563368320465088, "num_tokens": 238230451.0, "step": 6245 }, { "epoch": 0.7945554000763262, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.967084884643555, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8647995591163635, "num_tokens": 238266331.0, "step": 6246 }, { "epoch": 0.7946826103549167, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.75444793701172, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8457147479057312, "num_tokens": 238307753.0, "step": 6247 }, { "epoch": 0.7948098206335071, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.888004302978516, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.857679009437561, "num_tokens": 238349644.0, "step": 6248 }, { "epoch": 0.7949370309120977, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.8851375579834, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8430353403091431, "num_tokens": 238386308.0, "step": 6249 }, { "epoch": 0.7950642411906882, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.772235870361328, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8413866758346558, "num_tokens": 238427935.0, "step": 6250 }, { "epoch": 0.7951914514692787, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.08031463623047, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8625889420509338, "num_tokens": 238463639.0, "step": 6251 }, { "epoch": 0.7953186617478692, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.700668334960938, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8700947761535645, "num_tokens": 238498295.0, "step": 6252 }, { "epoch": 0.7954458720264598, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.989721298217773, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8561979532241821, "num_tokens": 238538109.0, "step": 6253 }, { "epoch": 0.7955730823050502, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.675823211669922, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8434141278266907, "num_tokens": 238574415.0, "step": 6254 }, { "epoch": 0.7957002925836407, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.0991096496582, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8583799600601196, "num_tokens": 238616368.0, "step": 6255 }, { "epoch": 0.7958275028622313, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.848880767822266, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.861396074295044, "num_tokens": 238650634.0, "step": 6256 }, { "epoch": 0.7959547131408218, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.874774932861328, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8513686656951904, "num_tokens": 238684383.0, "step": 6257 }, { "epoch": 0.7960819234194123, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.72890853881836, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8539963960647583, "num_tokens": 238722863.0, "step": 6258 }, { "epoch": 0.7962091336980028, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.15140914916992, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8587043881416321, "num_tokens": 238765027.0, "step": 6259 }, { "epoch": 0.7963363439765934, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.769901275634766, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8614903688430786, "num_tokens": 238801950.0, "step": 6260 }, { "epoch": 0.7964635542551838, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.810571670532227, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8489608764648438, "num_tokens": 238840963.0, "step": 6261 }, { "epoch": 0.7965907645337743, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.188751220703125, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8429616093635559, "num_tokens": 238876167.0, "step": 6262 }, { "epoch": 0.7967179748123648, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.952598571777344, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.861553966999054, "num_tokens": 238910293.0, "step": 6263 }, { "epoch": 0.7968451850909554, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.05344772338867, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8537134528160095, "num_tokens": 238947279.0, "step": 6264 }, { "epoch": 0.7969723953695459, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.02836990356445, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8463237285614014, "num_tokens": 238984148.0, "step": 6265 }, { "epoch": 0.7970996056481364, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.03421401977539, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8440722227096558, "num_tokens": 239022370.0, "step": 6266 }, { "epoch": 0.7972268159267268, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.940446853637695, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8536339998245239, "num_tokens": 239064542.0, "step": 6267 }, { "epoch": 0.7973540262053174, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.19063186645508, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8558684587478638, "num_tokens": 239103193.0, "step": 6268 }, { "epoch": 0.7974812364839079, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.887977600097656, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8640997409820557, "num_tokens": 239137952.0, "step": 6269 }, { "epoch": 0.7976084467624984, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.161376953125, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8464722633361816, "num_tokens": 239174600.0, "step": 6270 }, { "epoch": 0.797735657041089, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.806516647338867, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8634188771247864, "num_tokens": 239206157.0, "step": 6271 }, { "epoch": 0.7978628673196795, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.90892791748047, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8622235059738159, "num_tokens": 239243094.0, "step": 6272 }, { "epoch": 0.7979900775982699, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.98242950439453, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8579615354537964, "num_tokens": 239281208.0, "step": 6273 }, { "epoch": 0.7981172878768604, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.841360092163086, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8533904552459717, "num_tokens": 239320159.0, "step": 6274 }, { "epoch": 0.798244498155451, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.34233474731445, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8450955748558044, "num_tokens": 239360160.0, "step": 6275 }, { "epoch": 0.7983717084340415, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.134300231933594, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.860209584236145, "num_tokens": 239398101.0, "step": 6276 }, { "epoch": 0.798498918712632, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.07830810546875, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.872002899646759, "num_tokens": 239438560.0, "step": 6277 }, { "epoch": 0.7986261289912225, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.500518798828125, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8515174984931946, "num_tokens": 239475851.0, "step": 6278 }, { "epoch": 0.798753339269813, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.864477157592773, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8610195517539978, "num_tokens": 239519165.0, "step": 6279 }, { "epoch": 0.7988805495484035, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.33573913574219, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8447744846343994, "num_tokens": 239560602.0, "step": 6280 }, { "epoch": 0.799007759826994, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.199134826660156, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.859139084815979, "num_tokens": 239591933.0, "step": 6281 }, { "epoch": 0.7991349701055845, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.40608596801758, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.852100133895874, "num_tokens": 239625495.0, "step": 6282 }, { "epoch": 0.7992621803841751, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.8532657623291, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8730672597885132, "num_tokens": 239658249.0, "step": 6283 }, { "epoch": 0.7993893906627656, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.56842803955078, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8477773070335388, "num_tokens": 239690194.0, "step": 6284 }, { "epoch": 0.799516600941356, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.88633155822754, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8552015423774719, "num_tokens": 239728315.0, "step": 6285 }, { "epoch": 0.7996438112199465, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.57803726196289, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8619006872177124, "num_tokens": 239764505.0, "step": 6286 }, { "epoch": 0.7997710214985371, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.965831756591797, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8411887884140015, "num_tokens": 239809461.0, "step": 6287 }, { "epoch": 0.7998982317771276, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 32.34132766723633, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8513625264167786, "num_tokens": 239845232.0, "step": 6288 }, { "epoch": 0.8000254420557181, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.13296890258789, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8620074987411499, "num_tokens": 239885489.0, "step": 6289 }, { "epoch": 0.8001526523343087, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.11806869506836, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8435403108596802, "num_tokens": 239921159.0, "step": 6290 }, { "epoch": 0.8002798626128991, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.12101364135742, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.86234050989151, "num_tokens": 239960879.0, "step": 6291 }, { "epoch": 0.8004070728914896, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.86701202392578, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8458490371704102, "num_tokens": 239994302.0, "step": 6292 }, { "epoch": 0.8005342831700801, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.06477737426758, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8625643253326416, "num_tokens": 240027759.0, "step": 6293 }, { "epoch": 0.8006614934486707, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 32.20488739013672, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8525230884552002, "num_tokens": 240065049.0, "step": 6294 }, { "epoch": 0.8007887037272612, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.776086807250977, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8432179093360901, "num_tokens": 240101453.0, "step": 6295 }, { "epoch": 0.8009159140058517, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 32.34514236450195, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8358373641967773, "num_tokens": 240136831.0, "step": 6296 }, { "epoch": 0.8010431242844421, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.94911003112793, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8550615310668945, "num_tokens": 240173364.0, "step": 6297 }, { "epoch": 0.8011703345630327, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.98417854309082, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8341852426528931, "num_tokens": 240206718.0, "step": 6298 }, { "epoch": 0.8012975448416232, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.877851486206055, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8448408842086792, "num_tokens": 240245633.0, "step": 6299 }, { "epoch": 0.8014247551202137, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.158226013183594, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.869483470916748, "num_tokens": 240283887.0, "step": 6300 }, { "epoch": 0.8015519653988042, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.0661735534668, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8485875129699707, "num_tokens": 240322208.0, "step": 6301 }, { "epoch": 0.8016791756773948, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.03723907470703, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8601529002189636, "num_tokens": 240356154.0, "step": 6302 }, { "epoch": 0.8018063859559852, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.83945655822754, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8557543158531189, "num_tokens": 240389103.0, "step": 6303 }, { "epoch": 0.8019335962345757, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.349090576171875, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8730708956718445, "num_tokens": 240429545.0, "step": 6304 }, { "epoch": 0.8020608065131662, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.82245635986328, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8744738698005676, "num_tokens": 240468556.0, "step": 6305 }, { "epoch": 0.8021880167917568, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.512962341308594, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8575055599212646, "num_tokens": 240508534.0, "step": 6306 }, { "epoch": 0.8023152270703473, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.771556854248047, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8566974401473999, "num_tokens": 240540833.0, "step": 6307 }, { "epoch": 0.8024424373489378, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.44840621948242, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8476077318191528, "num_tokens": 240577768.0, "step": 6308 }, { "epoch": 0.8025696476275284, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.813058853149414, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8514193296432495, "num_tokens": 240610607.0, "step": 6309 }, { "epoch": 0.8026968579061188, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.224422454833984, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8535215258598328, "num_tokens": 240646056.0, "step": 6310 }, { "epoch": 0.8028240681847093, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.24052047729492, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8635041117668152, "num_tokens": 240682513.0, "step": 6311 }, { "epoch": 0.8029512784632998, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.271183013916016, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8578206300735474, "num_tokens": 240716311.0, "step": 6312 }, { "epoch": 0.8030784887418904, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.32203674316406, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8482162952423096, "num_tokens": 240758873.0, "step": 6313 }, { "epoch": 0.8032056990204809, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.30370330810547, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8612210750579834, "num_tokens": 240797163.0, "step": 6314 }, { "epoch": 0.8033329092990714, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.91607666015625, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8677307367324829, "num_tokens": 240835719.0, "step": 6315 }, { "epoch": 0.8034601195776618, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.321624755859375, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.851000964641571, "num_tokens": 240874359.0, "step": 6316 }, { "epoch": 0.8035873298562524, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.78152847290039, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8693234324455261, "num_tokens": 240909125.0, "step": 6317 }, { "epoch": 0.8037145401348429, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.17121124267578, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8597575426101685, "num_tokens": 240949391.0, "step": 6318 }, { "epoch": 0.8038417504134334, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.860193252563477, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8462717533111572, "num_tokens": 240987390.0, "step": 6319 }, { "epoch": 0.803968960692024, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.353431701660156, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8484903573989868, "num_tokens": 241025359.0, "step": 6320 }, { "epoch": 0.8040961709706145, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.674074172973633, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8498985767364502, "num_tokens": 241060532.0, "step": 6321 }, { "epoch": 0.8042233812492049, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 32.39175033569336, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8631216287612915, "num_tokens": 241098547.0, "step": 6322 }, { "epoch": 0.8043505915277954, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.648576736450195, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8548098802566528, "num_tokens": 241131091.0, "step": 6323 }, { "epoch": 0.804477801806386, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.44961166381836, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8463534116744995, "num_tokens": 241164495.0, "step": 6324 }, { "epoch": 0.8046050120849765, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.881887435913086, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8497699499130249, "num_tokens": 241203777.0, "step": 6325 }, { "epoch": 0.804732222363567, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.40650939941406, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8494040966033936, "num_tokens": 241243219.0, "step": 6326 }, { "epoch": 0.8048594326421575, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.826749801635742, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8398710489273071, "num_tokens": 241284547.0, "step": 6327 }, { "epoch": 0.804986642920748, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.95490074157715, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8375931978225708, "num_tokens": 241331110.0, "step": 6328 }, { "epoch": 0.8051138531993385, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 32.364864349365234, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8560711145401001, "num_tokens": 241372720.0, "step": 6329 }, { "epoch": 0.805241063477929, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.54004669189453, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8582614660263062, "num_tokens": 241408173.0, "step": 6330 }, { "epoch": 0.8053682737565195, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.531070709228516, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8630285263061523, "num_tokens": 241445021.0, "step": 6331 }, { "epoch": 0.8054954840351101, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.815786361694336, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8618783950805664, "num_tokens": 241481874.0, "step": 6332 }, { "epoch": 0.8056226943137006, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.46381759643555, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8418771028518677, "num_tokens": 241523140.0, "step": 6333 }, { "epoch": 0.805749904592291, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.781620025634766, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8371598720550537, "num_tokens": 241563424.0, "step": 6334 }, { "epoch": 0.8058771148708815, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.26744842529297, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8689742088317871, "num_tokens": 241600004.0, "step": 6335 }, { "epoch": 0.8060043251494721, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.93636131286621, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8540645837783813, "num_tokens": 241642704.0, "step": 6336 }, { "epoch": 0.8061315354280626, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.1083869934082, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8599648475646973, "num_tokens": 241679176.0, "step": 6337 }, { "epoch": 0.8062587457066531, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.682132720947266, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8601779937744141, "num_tokens": 241721168.0, "step": 6338 }, { "epoch": 0.8063859559852437, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.25430679321289, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8702265024185181, "num_tokens": 241759735.0, "step": 6339 }, { "epoch": 0.8065131662638341, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.755788803100586, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8577138185501099, "num_tokens": 241799445.0, "step": 6340 }, { "epoch": 0.8066403765424246, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.08325958251953, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.861065685749054, "num_tokens": 241839333.0, "step": 6341 }, { "epoch": 0.8067675868210151, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.766212463378906, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8456375002861023, "num_tokens": 241880248.0, "step": 6342 }, { "epoch": 0.8068947970996057, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.40509033203125, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8696867823600769, "num_tokens": 241926717.0, "step": 6343 }, { "epoch": 0.8070220073781962, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.639217376708984, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8475871086120605, "num_tokens": 241966080.0, "step": 6344 }, { "epoch": 0.8071492176567867, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.84028625488281, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8523756265640259, "num_tokens": 242011975.0, "step": 6345 }, { "epoch": 0.8072764279353771, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.672069549560547, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8686824440956116, "num_tokens": 242046320.0, "step": 6346 }, { "epoch": 0.8074036382139677, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.609989166259766, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8577567934989929, "num_tokens": 242085157.0, "step": 6347 }, { "epoch": 0.8075308484925582, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.89936637878418, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8512046337127686, "num_tokens": 242116548.0, "step": 6348 }, { "epoch": 0.8076580587711487, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.198272705078125, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8630330562591553, "num_tokens": 242151883.0, "step": 6349 }, { "epoch": 0.8077852690497392, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.188987731933594, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8533700108528137, "num_tokens": 242190486.0, "step": 6350 }, { "epoch": 0.8079124793283298, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.84244155883789, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.860002875328064, "num_tokens": 242232720.0, "step": 6351 }, { "epoch": 0.8080396896069202, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.3784065246582, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8579477071762085, "num_tokens": 242273926.0, "step": 6352 }, { "epoch": 0.8081668998855107, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.896377563476562, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8461942672729492, "num_tokens": 242313484.0, "step": 6353 }, { "epoch": 0.8082941101641012, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.983335494995117, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8494477272033691, "num_tokens": 242352101.0, "step": 6354 }, { "epoch": 0.8084213204426918, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.1051025390625, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8719523549079895, "num_tokens": 242392958.0, "step": 6355 }, { "epoch": 0.8085485307212823, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.90704917907715, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8566262722015381, "num_tokens": 242432327.0, "step": 6356 }, { "epoch": 0.8086757409998728, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.377986907958984, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8362364172935486, "num_tokens": 242473714.0, "step": 6357 }, { "epoch": 0.8088029512784632, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.07017135620117, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8635804653167725, "num_tokens": 242513179.0, "step": 6358 }, { "epoch": 0.8089301615570538, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.23749542236328, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8545992970466614, "num_tokens": 242549398.0, "step": 6359 }, { "epoch": 0.8090573718356443, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.260135650634766, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8513336777687073, "num_tokens": 242583705.0, "step": 6360 }, { "epoch": 0.8091845821142348, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.299095153808594, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8615593910217285, "num_tokens": 242625900.0, "step": 6361 }, { "epoch": 0.8093117923928254, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.850540161132812, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8453423380851746, "num_tokens": 242665766.0, "step": 6362 }, { "epoch": 0.8094390026714159, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.49542236328125, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8578066825866699, "num_tokens": 242707634.0, "step": 6363 }, { "epoch": 0.8095662129500064, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.76059341430664, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8523206114768982, "num_tokens": 242746791.0, "step": 6364 }, { "epoch": 0.8096934232285968, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.13347244262695, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8684741258621216, "num_tokens": 242784861.0, "step": 6365 }, { "epoch": 0.8098206335071874, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.932003021240234, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8710211515426636, "num_tokens": 242817612.0, "step": 6366 }, { "epoch": 0.8099478437857779, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.737213134765625, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.862910270690918, "num_tokens": 242854951.0, "step": 6367 }, { "epoch": 0.8100750540643684, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.39316177368164, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8711488842964172, "num_tokens": 242894166.0, "step": 6368 }, { "epoch": 0.8102022643429589, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.696012496948242, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8624023199081421, "num_tokens": 242936230.0, "step": 6369 }, { "epoch": 0.8103294746215495, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.14580535888672, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8679623007774353, "num_tokens": 242973148.0, "step": 6370 }, { "epoch": 0.8104566849001399, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.758888244628906, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8569823503494263, "num_tokens": 243013029.0, "step": 6371 }, { "epoch": 0.8105838951787304, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.07160186767578, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8560158014297485, "num_tokens": 243054847.0, "step": 6372 }, { "epoch": 0.810711105457321, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.741764068603516, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8504473567008972, "num_tokens": 243096524.0, "step": 6373 }, { "epoch": 0.8108383157359115, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.12874984741211, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8489294052124023, "num_tokens": 243132397.0, "step": 6374 }, { "epoch": 0.810965526014502, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.536531448364258, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.843173086643219, "num_tokens": 243171790.0, "step": 6375 }, { "epoch": 0.8110927362930925, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.0525016784668, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.855110764503479, "num_tokens": 243208217.0, "step": 6376 }, { "epoch": 0.811219946571683, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.078956604003906, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8620551228523254, "num_tokens": 243245260.0, "step": 6377 }, { "epoch": 0.8113471568502735, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.65899085998535, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8635677099227905, "num_tokens": 243283924.0, "step": 6378 }, { "epoch": 0.811474367128864, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.31760787963867, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8640543222427368, "num_tokens": 243321977.0, "step": 6379 }, { "epoch": 0.8116015774074545, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.519088745117188, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8539705276489258, "num_tokens": 243359349.0, "step": 6380 }, { "epoch": 0.8117287876860451, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.26845169067383, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8594397306442261, "num_tokens": 243395722.0, "step": 6381 }, { "epoch": 0.8118559979646356, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.775901794433594, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8443222641944885, "num_tokens": 243429022.0, "step": 6382 }, { "epoch": 0.811983208243226, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.45247268676758, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8360192775726318, "num_tokens": 243468474.0, "step": 6383 }, { "epoch": 0.8121104185218165, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.746143341064453, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8714362382888794, "num_tokens": 243503045.0, "step": 6384 }, { "epoch": 0.8122376288004071, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.67601013183594, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8462223410606384, "num_tokens": 243543865.0, "step": 6385 }, { "epoch": 0.8123648390789976, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.95709228515625, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.848532497882843, "num_tokens": 243581258.0, "step": 6386 }, { "epoch": 0.8124920493575881, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.48805236816406, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8497073650360107, "num_tokens": 243622777.0, "step": 6387 }, { "epoch": 0.8126192596361786, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.716400146484375, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8437302112579346, "num_tokens": 243664549.0, "step": 6388 }, { "epoch": 0.8127464699147691, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.498992919921875, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8514989614486694, "num_tokens": 243710408.0, "step": 6389 }, { "epoch": 0.8128736801933596, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.44011116027832, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8579151630401611, "num_tokens": 243752598.0, "step": 6390 }, { "epoch": 0.8130008904719501, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.51343536376953, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8539363145828247, "num_tokens": 243789805.0, "step": 6391 }, { "epoch": 0.8131281007505406, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.93531036376953, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.867215633392334, "num_tokens": 243824124.0, "step": 6392 }, { "epoch": 0.8132553110291312, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.40031433105469, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8561683893203735, "num_tokens": 243863448.0, "step": 6393 }, { "epoch": 0.8133825213077217, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.765607833862305, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8604941368103027, "num_tokens": 243902085.0, "step": 6394 }, { "epoch": 0.8135097315863121, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.315101623535156, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.85056471824646, "num_tokens": 243945349.0, "step": 6395 }, { "epoch": 0.8136369418649027, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.044315338134766, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8537192344665527, "num_tokens": 243982221.0, "step": 6396 }, { "epoch": 0.8137641521434932, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.925050735473633, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8442604541778564, "num_tokens": 244012576.0, "step": 6397 }, { "epoch": 0.8138913624220837, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.17060089111328, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8640104532241821, "num_tokens": 244052089.0, "step": 6398 }, { "epoch": 0.8140185727006742, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.95662498474121, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8582572340965271, "num_tokens": 244088712.0, "step": 6399 }, { "epoch": 0.8141457829792648, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.06440734863281, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8578311800956726, "num_tokens": 244127655.0, "step": 6400 }, { "epoch": 0.8142729932578552, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.795337677001953, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8608908653259277, "num_tokens": 244158588.0, "step": 6401 }, { "epoch": 0.8144002035364457, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.03485107421875, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8757063150405884, "num_tokens": 244193872.0, "step": 6402 }, { "epoch": 0.8145274138150362, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.0578498840332, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8395988345146179, "num_tokens": 244234055.0, "step": 6403 }, { "epoch": 0.8146546240936268, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.14181137084961, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8557480573654175, "num_tokens": 244275163.0, "step": 6404 }, { "epoch": 0.8147818343722173, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.800439834594727, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.852514386177063, "num_tokens": 244315241.0, "step": 6405 }, { "epoch": 0.8149090446508078, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.185909271240234, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8710340261459351, "num_tokens": 244351947.0, "step": 6406 }, { "epoch": 0.8150362549293982, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.14244842529297, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8546833992004395, "num_tokens": 244387706.0, "step": 6407 }, { "epoch": 0.8151634652079888, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.1623420715332, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8641980886459351, "num_tokens": 244428447.0, "step": 6408 }, { "epoch": 0.8152906754865793, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.200618743896484, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8414246439933777, "num_tokens": 244471191.0, "step": 6409 }, { "epoch": 0.8154178857651698, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.262596130371094, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8390675783157349, "num_tokens": 244509198.0, "step": 6410 }, { "epoch": 0.8155450960437604, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.01877212524414, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8476046323776245, "num_tokens": 244549677.0, "step": 6411 }, { "epoch": 0.8156723063223509, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.09526824951172, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.843480110168457, "num_tokens": 244583586.0, "step": 6412 }, { "epoch": 0.8157995166009414, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.10068130493164, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.86436927318573, "num_tokens": 244621220.0, "step": 6413 }, { "epoch": 0.8159267268795318, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.942378997802734, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8639244437217712, "num_tokens": 244655941.0, "step": 6414 }, { "epoch": 0.8160539371581224, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.17336654663086, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8731937408447266, "num_tokens": 244685807.0, "step": 6415 }, { "epoch": 0.8161811474367129, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.202362060546875, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8543478846549988, "num_tokens": 244726866.0, "step": 6416 }, { "epoch": 0.8163083577153034, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.905723571777344, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.847633421421051, "num_tokens": 244767273.0, "step": 6417 }, { "epoch": 0.8164355679938939, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.206085205078125, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8511934876441956, "num_tokens": 244806111.0, "step": 6418 }, { "epoch": 0.8165627782724845, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.07560348510742, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8670348525047302, "num_tokens": 244841706.0, "step": 6419 }, { "epoch": 0.8166899885510749, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.94631576538086, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8594424724578857, "num_tokens": 244883908.0, "step": 6420 }, { "epoch": 0.8168171988296654, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.13288879394531, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8474017977714539, "num_tokens": 244919090.0, "step": 6421 }, { "epoch": 0.8169444091082559, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.254852294921875, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.870364785194397, "num_tokens": 244961141.0, "step": 6422 }, { "epoch": 0.8170716193868465, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.426422119140625, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8390004634857178, "num_tokens": 245004503.0, "step": 6423 }, { "epoch": 0.817198829665437, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.15842819213867, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8543327450752258, "num_tokens": 245045042.0, "step": 6424 }, { "epoch": 0.8173260399440275, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.15742874145508, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8546975255012512, "num_tokens": 245085416.0, "step": 6425 }, { "epoch": 0.8174532502226179, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.57487106323242, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8445062637329102, "num_tokens": 245116137.0, "step": 6426 }, { "epoch": 0.8175804605012085, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.908632278442383, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8682001829147339, "num_tokens": 245159181.0, "step": 6427 }, { "epoch": 0.817707670779799, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.23979568481445, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8490726947784424, "num_tokens": 245205199.0, "step": 6428 }, { "epoch": 0.8178348810583895, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.0961799621582, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8605412840843201, "num_tokens": 245242455.0, "step": 6429 }, { "epoch": 0.8179620913369801, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.10554504394531, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8756769299507141, "num_tokens": 245280675.0, "step": 6430 }, { "epoch": 0.8180893016155706, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.21201705932617, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8713226318359375, "num_tokens": 245316947.0, "step": 6431 }, { "epoch": 0.818216511894161, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.114383697509766, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8611528873443604, "num_tokens": 245355145.0, "step": 6432 }, { "epoch": 0.8183437221727515, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.950557708740234, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8674662709236145, "num_tokens": 245390282.0, "step": 6433 }, { "epoch": 0.8184709324513421, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.25336837768555, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8663657903671265, "num_tokens": 245430262.0, "step": 6434 }, { "epoch": 0.8185981427299326, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.30837631225586, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8536604642868042, "num_tokens": 245470874.0, "step": 6435 }, { "epoch": 0.8187253530085231, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.25950241088867, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8628811836242676, "num_tokens": 245506626.0, "step": 6436 }, { "epoch": 0.8188525632871136, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.492584228515625, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8391817808151245, "num_tokens": 245546794.0, "step": 6437 }, { "epoch": 0.8189797735657041, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.04167175292969, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8587585687637329, "num_tokens": 245589544.0, "step": 6438 }, { "epoch": 0.8191069838442946, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.47734069824219, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8716498613357544, "num_tokens": 245628889.0, "step": 6439 }, { "epoch": 0.8192341941228851, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.8303279876709, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8527506589889526, "num_tokens": 245674615.0, "step": 6440 }, { "epoch": 0.8193614044014756, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.511356353759766, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8556804656982422, "num_tokens": 245713966.0, "step": 6441 }, { "epoch": 0.8194886146800662, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.24980163574219, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8616361618041992, "num_tokens": 245750035.0, "step": 6442 }, { "epoch": 0.8196158249586567, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.23728942871094, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8712967038154602, "num_tokens": 245788506.0, "step": 6443 }, { "epoch": 0.8197430352372471, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.345985412597656, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8590198755264282, "num_tokens": 245826547.0, "step": 6444 }, { "epoch": 0.8198702455158376, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.97332191467285, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8469939231872559, "num_tokens": 245874363.0, "step": 6445 }, { "epoch": 0.8199974557944282, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.31195068359375, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.839476466178894, "num_tokens": 245917813.0, "step": 6446 }, { "epoch": 0.8201246660730187, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.28352737426758, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8528138399124146, "num_tokens": 245952177.0, "step": 6447 }, { "epoch": 0.8202518763516092, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.20867156982422, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8585240840911865, "num_tokens": 245992897.0, "step": 6448 }, { "epoch": 0.8203790866301998, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.282169342041016, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.864904522895813, "num_tokens": 246029657.0, "step": 6449 }, { "epoch": 0.8205062969087902, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.06416702270508, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8465596437454224, "num_tokens": 246072425.0, "step": 6450 }, { "epoch": 0.8206335071873807, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.31230163574219, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8745906352996826, "num_tokens": 246113131.0, "step": 6451 }, { "epoch": 0.8207607174659712, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.098201751708984, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8587232232093811, "num_tokens": 246154548.0, "step": 6452 }, { "epoch": 0.8208879277445618, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.55080032348633, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8525298833847046, "num_tokens": 246189518.0, "step": 6453 }, { "epoch": 0.8210151380231523, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.01469039916992, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8522936105728149, "num_tokens": 246235248.0, "step": 6454 }, { "epoch": 0.8211423483017428, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.53410720825195, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8481531143188477, "num_tokens": 246277016.0, "step": 6455 }, { "epoch": 0.8212695585803332, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.887916564941406, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8525396585464478, "num_tokens": 246312352.0, "step": 6456 }, { "epoch": 0.8213967688589238, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.66758346557617, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8423638939857483, "num_tokens": 246355079.0, "step": 6457 }, { "epoch": 0.8215239791375143, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.03983688354492, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.844284176826477, "num_tokens": 246391073.0, "step": 6458 }, { "epoch": 0.8216511894161048, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.46257781982422, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8627943992614746, "num_tokens": 246435407.0, "step": 6459 }, { "epoch": 0.8217783996946953, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.215476989746094, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8426346778869629, "num_tokens": 246475303.0, "step": 6460 }, { "epoch": 0.8219056099732859, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.29680252075195, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8535627126693726, "num_tokens": 246515237.0, "step": 6461 }, { "epoch": 0.8220328202518764, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.2379264831543, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8517364859580994, "num_tokens": 246554958.0, "step": 6462 }, { "epoch": 0.8221600305304668, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.53771209716797, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8577268719673157, "num_tokens": 246592327.0, "step": 6463 }, { "epoch": 0.8222872408090574, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.29409408569336, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8610771894454956, "num_tokens": 246626965.0, "step": 6464 }, { "epoch": 0.8224144510876479, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.794002532958984, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8520433902740479, "num_tokens": 246662049.0, "step": 6465 }, { "epoch": 0.8225416613662384, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.069644927978516, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8581739068031311, "num_tokens": 246696989.0, "step": 6466 }, { "epoch": 0.8226688716448289, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.54428482055664, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8611810803413391, "num_tokens": 246738418.0, "step": 6467 }, { "epoch": 0.8227960819234195, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.80840301513672, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8574953675270081, "num_tokens": 246776779.0, "step": 6468 }, { "epoch": 0.8229232922020099, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.44910430908203, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8629938364028931, "num_tokens": 246814029.0, "step": 6469 }, { "epoch": 0.8230505024806004, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.31106185913086, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8482325077056885, "num_tokens": 246856390.0, "step": 6470 }, { "epoch": 0.8231777127591909, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.41262435913086, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8568768501281738, "num_tokens": 246895563.0, "step": 6471 }, { "epoch": 0.8233049230377815, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.540771484375, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8528205752372742, "num_tokens": 246935956.0, "step": 6472 }, { "epoch": 0.823432133316372, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.36550521850586, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8517329096794128, "num_tokens": 246984765.0, "step": 6473 }, { "epoch": 0.8235593435949625, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.99762725830078, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8593512773513794, "num_tokens": 247017086.0, "step": 6474 }, { "epoch": 0.8236865538735529, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.684654235839844, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8625722527503967, "num_tokens": 247056195.0, "step": 6475 }, { "epoch": 0.8238137641521435, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.003013610839844, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8698337078094482, "num_tokens": 247094063.0, "step": 6476 }, { "epoch": 0.823940974430734, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.93362808227539, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.854369580745697, "num_tokens": 247131743.0, "step": 6477 }, { "epoch": 0.8240681847093245, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.284637451171875, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.853665292263031, "num_tokens": 247169477.0, "step": 6478 }, { "epoch": 0.824195394987915, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.4586067199707, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8644520044326782, "num_tokens": 247209159.0, "step": 6479 }, { "epoch": 0.8243226052665056, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.22781753540039, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8593822121620178, "num_tokens": 247247603.0, "step": 6480 }, { "epoch": 0.824449815545096, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.3050651550293, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.866082489490509, "num_tokens": 247290184.0, "step": 6481 }, { "epoch": 0.8245770258236865, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.58343505859375, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.838794469833374, "num_tokens": 247331862.0, "step": 6482 }, { "epoch": 0.824704236102277, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.049476623535156, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8568487763404846, "num_tokens": 247368271.0, "step": 6483 }, { "epoch": 0.8248314463808676, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.55957794189453, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8561623096466064, "num_tokens": 247408077.0, "step": 6484 }, { "epoch": 0.8249586566594581, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.27747344970703, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8646877408027649, "num_tokens": 247447408.0, "step": 6485 }, { "epoch": 0.8250858669380486, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.51651382446289, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.868351936340332, "num_tokens": 247489863.0, "step": 6486 }, { "epoch": 0.8252130772166391, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.18193817138672, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8522160649299622, "num_tokens": 247532233.0, "step": 6487 }, { "epoch": 0.8253402874952296, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.44438552856445, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8308200836181641, "num_tokens": 247570719.0, "step": 6488 }, { "epoch": 0.8254674977738201, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.32370376586914, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8621101975440979, "num_tokens": 247603302.0, "step": 6489 }, { "epoch": 0.8255947080524106, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.66526794433594, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8598830103874207, "num_tokens": 247642837.0, "step": 6490 }, { "epoch": 0.8257219183310012, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.312442779541016, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8546213507652283, "num_tokens": 247685246.0, "step": 6491 }, { "epoch": 0.8258491286095917, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.55252456665039, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8477261662483215, "num_tokens": 247720871.0, "step": 6492 }, { "epoch": 0.8259763388881821, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.22959899902344, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8577312231063843, "num_tokens": 247759796.0, "step": 6493 }, { "epoch": 0.8261035491667726, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.70170593261719, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8353027105331421, "num_tokens": 247806486.0, "step": 6494 }, { "epoch": 0.8262307594453632, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.16029739379883, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8646605014801025, "num_tokens": 247842031.0, "step": 6495 }, { "epoch": 0.8263579697239537, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.29021072387695, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8603379130363464, "num_tokens": 247883438.0, "step": 6496 }, { "epoch": 0.8264851800025442, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.4448356628418, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8640620112419128, "num_tokens": 247921491.0, "step": 6497 }, { "epoch": 0.8266123902811348, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.01481246948242, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8675855994224548, "num_tokens": 247957454.0, "step": 6498 }, { "epoch": 0.8267396005597252, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.685638427734375, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.841690182685852, "num_tokens": 247997705.0, "step": 6499 }, { "epoch": 0.8268668108383157, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.256343841552734, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8522181510925293, "num_tokens": 248034536.0, "step": 6500 }, { "epoch": 0.8269940211169062, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.665992736816406, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8566792011260986, "num_tokens": 248080916.0, "step": 6501 }, { "epoch": 0.8271212313954968, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.18949890136719, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8556042909622192, "num_tokens": 248119719.0, "step": 6502 }, { "epoch": 0.8272484416740873, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.77699279785156, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8540886640548706, "num_tokens": 248156221.0, "step": 6503 }, { "epoch": 0.8273756519526778, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.247459411621094, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8586350679397583, "num_tokens": 248191780.0, "step": 6504 }, { "epoch": 0.8275028622312682, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.3017578125, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8518385887145996, "num_tokens": 248228594.0, "step": 6505 }, { "epoch": 0.8276300725098588, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.36432647705078, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8621670603752136, "num_tokens": 248260372.0, "step": 6506 }, { "epoch": 0.8277572827884493, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.699275970458984, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8489066362380981, "num_tokens": 248299361.0, "step": 6507 }, { "epoch": 0.8278844930670398, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.281429290771484, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8475096225738525, "num_tokens": 248337370.0, "step": 6508 }, { "epoch": 0.8280117033456303, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.32509994506836, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8592720031738281, "num_tokens": 248377541.0, "step": 6509 }, { "epoch": 0.8281389136242209, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.39267349243164, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8385891914367676, "num_tokens": 248413928.0, "step": 6510 }, { "epoch": 0.8282661239028114, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.0551643371582, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8558322191238403, "num_tokens": 248453283.0, "step": 6511 }, { "epoch": 0.8283933341814018, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.38160705566406, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.86223965883255, "num_tokens": 248490286.0, "step": 6512 }, { "epoch": 0.8285205444599923, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.31087112426758, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8404723405838013, "num_tokens": 248526349.0, "step": 6513 }, { "epoch": 0.8286477547385829, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.15250015258789, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8486347198486328, "num_tokens": 248561751.0, "step": 6514 }, { "epoch": 0.8287749650171734, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.35255432128906, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8690506219863892, "num_tokens": 248601921.0, "step": 6515 }, { "epoch": 0.8289021752957639, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.28996658325195, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8674639463424683, "num_tokens": 248641218.0, "step": 6516 }, { "epoch": 0.8290293855743545, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.611141204833984, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8583313226699829, "num_tokens": 248672616.0, "step": 6517 }, { "epoch": 0.8291565958529449, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.397972106933594, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8600364327430725, "num_tokens": 248708212.0, "step": 6518 }, { "epoch": 0.8292838061315354, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.65798568725586, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8601089715957642, "num_tokens": 248746575.0, "step": 6519 }, { "epoch": 0.8294110164101259, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.24777603149414, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8528574705123901, "num_tokens": 248789423.0, "step": 6520 }, { "epoch": 0.8295382266887165, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.54122543334961, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8592644929885864, "num_tokens": 248824044.0, "step": 6521 }, { "epoch": 0.829665436967307, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.3909797668457, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8540708422660828, "num_tokens": 248860805.0, "step": 6522 }, { "epoch": 0.8297926472458975, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.393638610839844, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8622030019760132, "num_tokens": 248898958.0, "step": 6523 }, { "epoch": 0.8299198575244879, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.25115966796875, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8562925457954407, "num_tokens": 248938847.0, "step": 6524 }, { "epoch": 0.8300470678030785, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.382869720458984, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8534078001976013, "num_tokens": 248977944.0, "step": 6525 }, { "epoch": 0.830174278081669, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.58237838745117, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8505477905273438, "num_tokens": 249014610.0, "step": 6526 }, { "epoch": 0.8303014883602595, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.043357849121094, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8260147571563721, "num_tokens": 249053869.0, "step": 6527 }, { "epoch": 0.83042869863885, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.50376892089844, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8491519093513489, "num_tokens": 249096874.0, "step": 6528 }, { "epoch": 0.8305559089174406, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.36840057373047, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8536854982376099, "num_tokens": 249133141.0, "step": 6529 }, { "epoch": 0.830683119196031, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.67517852783203, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8609861135482788, "num_tokens": 249163813.0, "step": 6530 }, { "epoch": 0.8308103294746215, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.44856643676758, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8490476012229919, "num_tokens": 249202935.0, "step": 6531 }, { "epoch": 0.830937539753212, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.410404205322266, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8636592626571655, "num_tokens": 249241061.0, "step": 6532 }, { "epoch": 0.8310647500318026, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.35373306274414, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.84357750415802, "num_tokens": 249277863.0, "step": 6533 }, { "epoch": 0.8311919603103931, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.50035095214844, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8560866713523865, "num_tokens": 249318047.0, "step": 6534 }, { "epoch": 0.8313191705889836, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.52641296386719, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8511818647384644, "num_tokens": 249362095.0, "step": 6535 }, { "epoch": 0.831446380867574, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.651878356933594, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8706506490707397, "num_tokens": 249403362.0, "step": 6536 }, { "epoch": 0.8315735911461646, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.37376403808594, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8637837767601013, "num_tokens": 249438051.0, "step": 6537 }, { "epoch": 0.8317008014247551, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.461021423339844, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8610197305679321, "num_tokens": 249477185.0, "step": 6538 }, { "epoch": 0.8318280117033456, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.302833557128906, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8497051000595093, "num_tokens": 249516880.0, "step": 6539 }, { "epoch": 0.8319552219819362, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.61027908325195, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8663806319236755, "num_tokens": 249555864.0, "step": 6540 }, { "epoch": 0.8320824322605267, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.371795654296875, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8549786806106567, "num_tokens": 249590566.0, "step": 6541 }, { "epoch": 0.8322096425391171, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.4129524230957, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8688240051269531, "num_tokens": 249631523.0, "step": 6542 }, { "epoch": 0.8323368528177076, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.7236328125, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8641525506973267, "num_tokens": 249674113.0, "step": 6543 }, { "epoch": 0.8324640630962982, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.41039276123047, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8679612278938293, "num_tokens": 249712934.0, "step": 6544 }, { "epoch": 0.8325912733748887, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.592185974121094, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8615447282791138, "num_tokens": 249750295.0, "step": 6545 }, { "epoch": 0.8327184836534792, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.32712173461914, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8616626858711243, "num_tokens": 249790289.0, "step": 6546 }, { "epoch": 0.8328456939320698, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.8958625793457, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8688527345657349, "num_tokens": 249822873.0, "step": 6547 }, { "epoch": 0.8329729042106602, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.28901290893555, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8573258519172668, "num_tokens": 249860564.0, "step": 6548 }, { "epoch": 0.8331001144892507, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.13359069824219, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8555445671081543, "num_tokens": 249901448.0, "step": 6549 }, { "epoch": 0.8332273247678412, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.53987503051758, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8447728753089905, "num_tokens": 249933524.0, "step": 6550 }, { "epoch": 0.8333545350464318, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.79780578613281, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8518493175506592, "num_tokens": 249971856.0, "step": 6551 }, { "epoch": 0.8334817453250223, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.58526611328125, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.848160982131958, "num_tokens": 250013940.0, "step": 6552 }, { "epoch": 0.8336089556036128, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.5618782043457, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8389092683792114, "num_tokens": 250044826.0, "step": 6553 }, { "epoch": 0.8337361658822032, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.778564453125, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.839596688747406, "num_tokens": 250087409.0, "step": 6554 }, { "epoch": 0.8338633761607938, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.38230514526367, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8620783090591431, "num_tokens": 250126236.0, "step": 6555 }, { "epoch": 0.8339905864393843, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.70214080810547, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8656581044197083, "num_tokens": 250163559.0, "step": 6556 }, { "epoch": 0.8341177967179748, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.54112243652344, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8579988479614258, "num_tokens": 250203536.0, "step": 6557 }, { "epoch": 0.8342450069965653, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.53295135498047, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8607972860336304, "num_tokens": 250237455.0, "step": 6558 }, { "epoch": 0.8343722172751559, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.59315490722656, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8653075098991394, "num_tokens": 250274559.0, "step": 6559 }, { "epoch": 0.8344994275537464, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.657405853271484, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8572598695755005, "num_tokens": 250309392.0, "step": 6560 }, { "epoch": 0.8346266378323368, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.4055290222168, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8624886274337769, "num_tokens": 250352010.0, "step": 6561 }, { "epoch": 0.8347538481109273, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.60734176635742, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8439674377441406, "num_tokens": 250385745.0, "step": 6562 }, { "epoch": 0.8348810583895179, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.08918380737305, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8550363779067993, "num_tokens": 250426055.0, "step": 6563 }, { "epoch": 0.8350082686681084, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.00199508666992, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8545969724655151, "num_tokens": 250468795.0, "step": 6564 }, { "epoch": 0.8351354789466989, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.16436767578125, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8548420667648315, "num_tokens": 250509115.0, "step": 6565 }, { "epoch": 0.8352626892252895, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.96054458618164, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8580180406570435, "num_tokens": 250547770.0, "step": 6566 }, { "epoch": 0.8353898995038799, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.593936920166016, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8527723550796509, "num_tokens": 250589737.0, "step": 6567 }, { "epoch": 0.8355171097824704, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.64558029174805, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8641290664672852, "num_tokens": 250623058.0, "step": 6568 }, { "epoch": 0.8356443200610609, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.60808563232422, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8571783304214478, "num_tokens": 250660526.0, "step": 6569 }, { "epoch": 0.8357715303396515, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.651588439941406, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.841281533241272, "num_tokens": 250698218.0, "step": 6570 }, { "epoch": 0.835898740618242, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.47939682006836, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8489025831222534, "num_tokens": 250743410.0, "step": 6571 }, { "epoch": 0.8360259508968325, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.74705123901367, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.865296483039856, "num_tokens": 250781919.0, "step": 6572 }, { "epoch": 0.8361531611754229, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.263118743896484, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8547816276550293, "num_tokens": 250817051.0, "step": 6573 }, { "epoch": 0.8362803714540135, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.78862762451172, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8534528017044067, "num_tokens": 250857241.0, "step": 6574 }, { "epoch": 0.836407581732604, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.21504211425781, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8578600287437439, "num_tokens": 250903591.0, "step": 6575 }, { "epoch": 0.8365347920111945, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.86025619506836, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8500438928604126, "num_tokens": 250945788.0, "step": 6576 }, { "epoch": 0.836662002289785, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.059261322021484, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8499337434768677, "num_tokens": 250984884.0, "step": 6577 }, { "epoch": 0.8367892125683756, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.00783157348633, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8689199686050415, "num_tokens": 251024347.0, "step": 6578 }, { "epoch": 0.836916422846966, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.087303161621094, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8778657913208008, "num_tokens": 251068527.0, "step": 6579 }, { "epoch": 0.8370436331255565, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.724124908447266, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.862112283706665, "num_tokens": 251109914.0, "step": 6580 }, { "epoch": 0.837170843404147, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.333805084228516, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8480172157287598, "num_tokens": 251144760.0, "step": 6581 }, { "epoch": 0.8372980536827376, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.1891975402832, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8550131320953369, "num_tokens": 251184456.0, "step": 6582 }, { "epoch": 0.8374252639613281, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.236751556396484, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8486300110816956, "num_tokens": 251229493.0, "step": 6583 }, { "epoch": 0.8375524742399186, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.81776428222656, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8623412847518921, "num_tokens": 251270886.0, "step": 6584 }, { "epoch": 0.837679684518509, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.46146011352539, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8658450245857239, "num_tokens": 251304833.0, "step": 6585 }, { "epoch": 0.8378068947970996, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.834476470947266, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8599928617477417, "num_tokens": 251334196.0, "step": 6586 }, { "epoch": 0.8379341050756901, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.8029899597168, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8451521396636963, "num_tokens": 251380278.0, "step": 6587 }, { "epoch": 0.8380613153542806, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.778079986572266, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8501003980636597, "num_tokens": 251418432.0, "step": 6588 }, { "epoch": 0.8381885256328712, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.908119201660156, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8545924425125122, "num_tokens": 251453747.0, "step": 6589 }, { "epoch": 0.8383157359114617, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.79556655883789, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8558756113052368, "num_tokens": 251492439.0, "step": 6590 }, { "epoch": 0.8384429461900521, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.81863021850586, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.856942892074585, "num_tokens": 251527945.0, "step": 6591 }, { "epoch": 0.8385701564686426, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.47742462158203, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8642522692680359, "num_tokens": 251567161.0, "step": 6592 }, { "epoch": 0.8386973667472332, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.82447052001953, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8565627336502075, "num_tokens": 251601189.0, "step": 6593 }, { "epoch": 0.8388245770258237, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.516239166259766, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8532638549804688, "num_tokens": 251630846.0, "step": 6594 }, { "epoch": 0.8389517873044142, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.82829284667969, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8643234968185425, "num_tokens": 251665701.0, "step": 6595 }, { "epoch": 0.8390789975830047, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.562660217285156, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8473273515701294, "num_tokens": 251699909.0, "step": 6596 }, { "epoch": 0.8392062078615952, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.16054916381836, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8547117710113525, "num_tokens": 251734052.0, "step": 6597 }, { "epoch": 0.8393334181401857, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.198062896728516, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8452359437942505, "num_tokens": 251772518.0, "step": 6598 }, { "epoch": 0.8394606284187762, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 33.243980407714844, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.853874921798706, "num_tokens": 251816378.0, "step": 6599 }, { "epoch": 0.8395878386973668, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.33182144165039, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8563307523727417, "num_tokens": 251851264.0, "step": 6600 }, { "epoch": 0.8397150489759573, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.74008560180664, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.858453631401062, "num_tokens": 251888906.0, "step": 6601 }, { "epoch": 0.8398422592545478, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.274356842041016, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8630359172821045, "num_tokens": 251930641.0, "step": 6602 }, { "epoch": 0.8399694695331382, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.17525100708008, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8389285802841187, "num_tokens": 251968929.0, "step": 6603 }, { "epoch": 0.8400966798117288, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.385101318359375, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8486725091934204, "num_tokens": 252006751.0, "step": 6604 }, { "epoch": 0.8402238900903193, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 33.2354621887207, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.851917028427124, "num_tokens": 252045861.0, "step": 6605 }, { "epoch": 0.8403511003689098, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.41903305053711, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8490937948226929, "num_tokens": 252084412.0, "step": 6606 }, { "epoch": 0.8404783106475003, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.26487350463867, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8606724143028259, "num_tokens": 252120576.0, "step": 6607 }, { "epoch": 0.8406055209260909, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.45112991333008, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8548431396484375, "num_tokens": 252161636.0, "step": 6608 }, { "epoch": 0.8407327312046813, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.55026626586914, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8569623231887817, "num_tokens": 252199669.0, "step": 6609 }, { "epoch": 0.8408599414832718, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.51116943359375, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8644391894340515, "num_tokens": 252236832.0, "step": 6610 }, { "epoch": 0.8409871517618623, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.31363296508789, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8514001965522766, "num_tokens": 252282518.0, "step": 6611 }, { "epoch": 0.8411143620404529, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.55147933959961, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8504689335823059, "num_tokens": 252316479.0, "step": 6612 }, { "epoch": 0.8412415723190434, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.16902160644531, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8511696457862854, "num_tokens": 252353789.0, "step": 6613 }, { "epoch": 0.8413687825976339, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.55160903930664, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8587659597396851, "num_tokens": 252391609.0, "step": 6614 }, { "epoch": 0.8414959928762245, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.7619743347168, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8695055246353149, "num_tokens": 252433514.0, "step": 6615 }, { "epoch": 0.8416232031548149, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.83686828613281, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8610278367996216, "num_tokens": 252469700.0, "step": 6616 }, { "epoch": 0.8417504134334054, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.32282257080078, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8533468842506409, "num_tokens": 252501658.0, "step": 6617 }, { "epoch": 0.8418776237119959, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.02509689331055, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8586515188217163, "num_tokens": 252534546.0, "step": 6618 }, { "epoch": 0.8420048339905865, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.45161819458008, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8636100888252258, "num_tokens": 252573172.0, "step": 6619 }, { "epoch": 0.842132044269177, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.893314361572266, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8657877445220947, "num_tokens": 252607258.0, "step": 6620 }, { "epoch": 0.8422592545477675, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.56455612182617, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8672975897789001, "num_tokens": 252651451.0, "step": 6621 }, { "epoch": 0.8423864648263579, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.606082916259766, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8687788248062134, "num_tokens": 252686348.0, "step": 6622 }, { "epoch": 0.8425136751049485, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.11445617675781, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8494681715965271, "num_tokens": 252724316.0, "step": 6623 }, { "epoch": 0.842640885383539, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.37199783325195, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8466026782989502, "num_tokens": 252764612.0, "step": 6624 }, { "epoch": 0.8427680956621295, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.945701599121094, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8347892165184021, "num_tokens": 252809510.0, "step": 6625 }, { "epoch": 0.84289530594072, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.34165954589844, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.854019045829773, "num_tokens": 252847598.0, "step": 6626 }, { "epoch": 0.8430225162193106, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.95848846435547, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8422949314117432, "num_tokens": 252884401.0, "step": 6627 }, { "epoch": 0.843149726497901, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.09236526489258, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8402817845344543, "num_tokens": 252928348.0, "step": 6628 }, { "epoch": 0.8432769367764915, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 33.31827163696289, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8550348281860352, "num_tokens": 252965084.0, "step": 6629 }, { "epoch": 0.843404147055082, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.00297546386719, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.860414981842041, "num_tokens": 253003697.0, "step": 6630 }, { "epoch": 0.8435313573336726, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.868995666503906, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.842288613319397, "num_tokens": 253037239.0, "step": 6631 }, { "epoch": 0.8436585676122631, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.86404800415039, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8481638431549072, "num_tokens": 253078163.0, "step": 6632 }, { "epoch": 0.8437857778908536, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 33.29839324951172, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8541418313980103, "num_tokens": 253118907.0, "step": 6633 }, { "epoch": 0.843912988169444, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.28948211669922, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8735764026641846, "num_tokens": 253153132.0, "step": 6634 }, { "epoch": 0.8440401984480346, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.033172607421875, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8534799814224243, "num_tokens": 253194446.0, "step": 6635 }, { "epoch": 0.8441674087266251, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.62111282348633, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8583789467811584, "num_tokens": 253230798.0, "step": 6636 }, { "epoch": 0.8442946190052156, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.780487060546875, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8480082750320435, "num_tokens": 253269029.0, "step": 6637 }, { "epoch": 0.8444218292838062, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.55750274658203, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8529483079910278, "num_tokens": 253310656.0, "step": 6638 }, { "epoch": 0.8445490395623967, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.6754150390625, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8746730089187622, "num_tokens": 253346882.0, "step": 6639 }, { "epoch": 0.8446762498409871, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.978084564208984, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8462380766868591, "num_tokens": 253389601.0, "step": 6640 }, { "epoch": 0.8448034601195776, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.55445098876953, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8539971113204956, "num_tokens": 253428725.0, "step": 6641 }, { "epoch": 0.8449306703981682, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.68140411376953, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8702610731124878, "num_tokens": 253461404.0, "step": 6642 }, { "epoch": 0.8450578806767587, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.85918045043945, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8597045540809631, "num_tokens": 253500324.0, "step": 6643 }, { "epoch": 0.8451850909553492, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.11024856567383, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8436671495437622, "num_tokens": 253542500.0, "step": 6644 }, { "epoch": 0.8453123012339397, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.85967254638672, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8380202054977417, "num_tokens": 253581887.0, "step": 6645 }, { "epoch": 0.8454395115125302, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.82651138305664, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8644758462905884, "num_tokens": 253616942.0, "step": 6646 }, { "epoch": 0.8455667217911207, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.045352935791016, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8509770631790161, "num_tokens": 253652610.0, "step": 6647 }, { "epoch": 0.8456939320697112, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.84760665893555, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8530581593513489, "num_tokens": 253686269.0, "step": 6648 }, { "epoch": 0.8458211423483017, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 33.015586853027344, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8504313230514526, "num_tokens": 253723632.0, "step": 6649 }, { "epoch": 0.8459483526268923, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.66245651245117, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8494845628738403, "num_tokens": 253755990.0, "step": 6650 }, { "epoch": 0.8460755629054828, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 33.18873596191406, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8501496315002441, "num_tokens": 253794200.0, "step": 6651 }, { "epoch": 0.8462027731840732, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 33.137298583984375, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8679301738739014, "num_tokens": 253840554.0, "step": 6652 }, { "epoch": 0.8463299834626637, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.76544189453125, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8643116354942322, "num_tokens": 253873642.0, "step": 6653 }, { "epoch": 0.8464571937412543, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 33.112709045410156, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8619363903999329, "num_tokens": 253911839.0, "step": 6654 }, { "epoch": 0.8465844040198448, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.842010498046875, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8462154865264893, "num_tokens": 253944649.0, "step": 6655 }, { "epoch": 0.8467116142984353, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 42.130638122558594, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8551450967788696, "num_tokens": 253977739.0, "step": 6656 }, { "epoch": 0.8468388245770259, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.59182357788086, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8418241739273071, "num_tokens": 254021216.0, "step": 6657 }, { "epoch": 0.8469660348556163, "ewc_loss": 0.09375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.39582443237305, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8534440398216248, "num_tokens": 254063209.0, "step": 6658 }, { "epoch": 0.8470932451342068, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.392635345458984, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8503291606903076, "num_tokens": 254102325.0, "step": 6659 }, { "epoch": 0.8472204554127973, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.684328079223633, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.861214280128479, "num_tokens": 254136991.0, "step": 6660 }, { "epoch": 0.8473476656913879, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.175493240356445, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8553450703620911, "num_tokens": 254179520.0, "step": 6661 }, { "epoch": 0.8474748759699784, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.13851547241211, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8561918139457703, "num_tokens": 254220661.0, "step": 6662 }, { "epoch": 0.8476020862485689, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 30.785282135009766, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8633701801300049, "num_tokens": 254261853.0, "step": 6663 }, { "epoch": 0.8477292965271594, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 30.95217514038086, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8575477600097656, "num_tokens": 254301577.0, "step": 6664 }, { "epoch": 0.8478565068057499, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 30.73533058166504, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8688133955001831, "num_tokens": 254341792.0, "step": 6665 }, { "epoch": 0.8479837170843404, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.03510856628418, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8427051901817322, "num_tokens": 254376755.0, "step": 6666 }, { "epoch": 0.8481109273629309, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 30.67418670654297, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8660957217216492, "num_tokens": 254413565.0, "step": 6667 }, { "epoch": 0.8482381376415215, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 30.711172103881836, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8447692394256592, "num_tokens": 254449889.0, "step": 6668 }, { "epoch": 0.848365347920112, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 30.66025161743164, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.86961829662323, "num_tokens": 254488129.0, "step": 6669 }, { "epoch": 0.8484925581987025, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 30.990468978881836, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8683274984359741, "num_tokens": 254522031.0, "step": 6670 }, { "epoch": 0.8486197684772929, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 30.371309280395508, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8675855994224548, "num_tokens": 254559249.0, "step": 6671 }, { "epoch": 0.8487469787558835, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.137754440307617, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8496088981628418, "num_tokens": 254603968.0, "step": 6672 }, { "epoch": 0.848874189034474, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 30.3626708984375, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8665529489517212, "num_tokens": 254642178.0, "step": 6673 }, { "epoch": 0.8490013993130645, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.054702758789062, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8485538959503174, "num_tokens": 254679129.0, "step": 6674 }, { "epoch": 0.849128609591655, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 30.56497573852539, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8274199962615967, "num_tokens": 254714952.0, "step": 6675 }, { "epoch": 0.8492558198702456, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 30.684370040893555, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8524913191795349, "num_tokens": 254743323.0, "step": 6676 }, { "epoch": 0.849383030148836, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.64546775817871, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8636506199836731, "num_tokens": 254784749.0, "step": 6677 }, { "epoch": 0.8495102404274265, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.65989875793457, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8625675439834595, "num_tokens": 254820102.0, "step": 6678 }, { "epoch": 0.849637450706017, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.442426681518555, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8531267642974854, "num_tokens": 254859988.0, "step": 6679 }, { "epoch": 0.8497646609846076, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.966102600097656, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8517926931381226, "num_tokens": 254906792.0, "step": 6680 }, { "epoch": 0.8498918712631981, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.603919982910156, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8580605387687683, "num_tokens": 254941552.0, "step": 6681 }, { "epoch": 0.8500190815417886, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.879491806030273, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8477407097816467, "num_tokens": 254974653.0, "step": 6682 }, { "epoch": 0.850146291820379, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.729736328125, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8404009342193604, "num_tokens": 255018418.0, "step": 6683 }, { "epoch": 0.8502735020989696, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.744707107543945, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8602122068405151, "num_tokens": 255049626.0, "step": 6684 }, { "epoch": 0.8504007123775601, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.712961196899414, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8502411842346191, "num_tokens": 255086323.0, "step": 6685 }, { "epoch": 0.8505279226561506, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 30.81096076965332, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8648037910461426, "num_tokens": 255124102.0, "step": 6686 }, { "epoch": 0.8506551329347412, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 30.739534378051758, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8602816462516785, "num_tokens": 255162176.0, "step": 6687 }, { "epoch": 0.8507823432133317, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.86777114868164, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8590071201324463, "num_tokens": 255196256.0, "step": 6688 }, { "epoch": 0.8509095534919221, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 30.602947235107422, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8496061563491821, "num_tokens": 255236752.0, "step": 6689 }, { "epoch": 0.8510367637705126, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.111873626708984, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8562767505645752, "num_tokens": 255268694.0, "step": 6690 }, { "epoch": 0.8511639740491032, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.729862213134766, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8613077998161316, "num_tokens": 255309445.0, "step": 6691 }, { "epoch": 0.8512911843276937, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.421382904052734, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8600038290023804, "num_tokens": 255341126.0, "step": 6692 }, { "epoch": 0.8514183946062842, "ewc_loss": 0.0830078125, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 6.532669067382812e-05, "grad_norm": 30.368751525878906, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8652359247207642, "num_tokens": 255379553.0, "step": 6693 }, { "epoch": 0.8515456048848747, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.33867073059082, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8573194742202759, "num_tokens": 255416588.0, "step": 6694 }, { "epoch": 0.8516728151634652, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.74468231201172, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8416299223899841, "num_tokens": 255449574.0, "step": 6695 }, { "epoch": 0.8518000254420557, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.036205291748047, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8544976115226746, "num_tokens": 255494088.0, "step": 6696 }, { "epoch": 0.8519272357206462, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.938928604125977, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8643955588340759, "num_tokens": 255529785.0, "step": 6697 }, { "epoch": 0.8520544459992367, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.327722549438477, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8426075577735901, "num_tokens": 255566675.0, "step": 6698 }, { "epoch": 0.8521816562778273, "ewc_loss": 0.08349609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.580352783203125e-05, "grad_norm": 30.896020889282227, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8598592281341553, "num_tokens": 255606502.0, "step": 6699 }, { "epoch": 0.8523088665564178, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.16048240661621, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8475280404090881, "num_tokens": 255649179.0, "step": 6700 }, { "epoch": 0.8524360768350082, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.192659378051758, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8564868569374084, "num_tokens": 255688190.0, "step": 6701 }, { "epoch": 0.8525632871135987, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.757516860961914, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8544731736183167, "num_tokens": 255730401.0, "step": 6702 }, { "epoch": 0.8526904973921893, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 31.449604034423828, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8512284755706787, "num_tokens": 255767881.0, "step": 6703 }, { "epoch": 0.8528177076707798, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 30.508766174316406, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8471320867538452, "num_tokens": 255808347.0, "step": 6704 }, { "epoch": 0.8529449179493703, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.410470962524414, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.838849663734436, "num_tokens": 255849538.0, "step": 6705 }, { "epoch": 0.8530721282279609, "ewc_loss": 0.083984375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 6.628036499023438e-05, "grad_norm": 30.70026969909668, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8669560551643372, "num_tokens": 255890228.0, "step": 6706 }, { "epoch": 0.8531993385065513, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.58453941345215, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8547399044036865, "num_tokens": 255930189.0, "step": 6707 }, { "epoch": 0.8533265487851418, "ewc_loss": 0.08447265625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 6.67572021484375e-05, "grad_norm": 30.872650146484375, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.863105833530426, "num_tokens": 255968310.0, "step": 6708 }, { "epoch": 0.8534537590637323, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 30.904281616210938, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8675397634506226, "num_tokens": 256005872.0, "step": 6709 }, { "epoch": 0.8535809693423229, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.723403930664062e-05, "grad_norm": 31.23611068725586, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8612837791442871, "num_tokens": 256045374.0, "step": 6710 }, { "epoch": 0.8537081796209134, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 30.97966957092285, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8587542772293091, "num_tokens": 256084874.0, "step": 6711 }, { "epoch": 0.8538353898995039, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 30.957990646362305, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8481138944625854, "num_tokens": 256122251.0, "step": 6712 }, { "epoch": 0.8539626001780944, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 30.983646392822266, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8548948764801025, "num_tokens": 256165025.0, "step": 6713 }, { "epoch": 0.8540898104566849, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 30.747587203979492, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8635687232017517, "num_tokens": 256202220.0, "step": 6714 }, { "epoch": 0.8542170207352754, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 31.173973083496094, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.858198881149292, "num_tokens": 256234036.0, "step": 6715 }, { "epoch": 0.8543442310138659, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 30.849472045898438, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8553569316864014, "num_tokens": 256275253.0, "step": 6716 }, { "epoch": 0.8544714412924564, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.02484130859375, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8535093069076538, "num_tokens": 256311296.0, "step": 6717 }, { "epoch": 0.854598651571047, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 30.9631404876709, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8617586493492126, "num_tokens": 256348149.0, "step": 6718 }, { "epoch": 0.8547258618496375, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.34856414794922, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8704417943954468, "num_tokens": 256387861.0, "step": 6719 }, { "epoch": 0.8548530721282279, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 30.970787048339844, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8657222986221313, "num_tokens": 256427131.0, "step": 6720 }, { "epoch": 0.8549802824068184, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.323680877685547, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8611351847648621, "num_tokens": 256465041.0, "step": 6721 }, { "epoch": 0.855107492685409, "ewc_loss": 0.0859375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.866455078125e-05, "grad_norm": 30.87778091430664, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8476489186286926, "num_tokens": 256502576.0, "step": 6722 }, { "epoch": 0.8552347029639995, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.403488159179688, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.859568178653717, "num_tokens": 256540493.0, "step": 6723 }, { "epoch": 0.85536191324259, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.079252243041992, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8410689830780029, "num_tokens": 256578115.0, "step": 6724 }, { "epoch": 0.8554891235211806, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.239891052246094, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8645083904266357, "num_tokens": 256615957.0, "step": 6725 }, { "epoch": 0.855616333799771, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.060474395751953, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8603395223617554, "num_tokens": 256649972.0, "step": 6726 }, { "epoch": 0.8557435440783615, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.491474151611328, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8638178110122681, "num_tokens": 256686683.0, "step": 6727 }, { "epoch": 0.855870754356952, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.098983764648438, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8544228076934814, "num_tokens": 256723587.0, "step": 6728 }, { "epoch": 0.8559979646355426, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.304515838623047, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.856548011302948, "num_tokens": 256765245.0, "step": 6729 }, { "epoch": 0.8561251749141331, "ewc_loss": 0.08544921875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.818771362304688e-05, "grad_norm": 31.274581909179688, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8844723701477051, "num_tokens": 256806930.0, "step": 6730 }, { "epoch": 0.8562523851927236, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.26999855041504, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8556509017944336, "num_tokens": 256844568.0, "step": 6731 }, { "epoch": 0.856379595471314, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.32012367248535, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8543416857719421, "num_tokens": 256877483.0, "step": 6732 }, { "epoch": 0.8565068057499046, "ewc_loss": 0.0849609375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.771087646484375e-05, "grad_norm": 31.03995132446289, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8606679439544678, "num_tokens": 256921139.0, "step": 6733 }, { "epoch": 0.8566340160284951, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.65920639038086, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8578304648399353, "num_tokens": 256957861.0, "step": 6734 }, { "epoch": 0.8567612263070856, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.077159881591797, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8582863211631775, "num_tokens": 256998582.0, "step": 6735 }, { "epoch": 0.8568884365856761, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.477327346801758, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8559010624885559, "num_tokens": 257037638.0, "step": 6736 }, { "epoch": 0.8570156468642667, "ewc_loss": 0.08642578125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.914138793945312e-05, "grad_norm": 31.140649795532227, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8583710193634033, "num_tokens": 257074427.0, "step": 6737 }, { "epoch": 0.8571428571428571, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.414804458618164, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8604569435119629, "num_tokens": 257113826.0, "step": 6738 }, { "epoch": 0.8572700674214476, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.384233474731445, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8429381847381592, "num_tokens": 257152387.0, "step": 6739 }, { "epoch": 0.8573972777000382, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.266143798828125, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8579515218734741, "num_tokens": 257192083.0, "step": 6740 }, { "epoch": 0.8575244879786287, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.49266815185547, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.848750650882721, "num_tokens": 257235425.0, "step": 6741 }, { "epoch": 0.8576516982572192, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.4715576171875, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8534023761749268, "num_tokens": 257268621.0, "step": 6742 }, { "epoch": 0.8577789085358097, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.063282012939453, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8617525696754456, "num_tokens": 257306909.0, "step": 6743 }, { "epoch": 0.8579061188144002, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.805965423583984, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8694854974746704, "num_tokens": 257343388.0, "step": 6744 }, { "epoch": 0.8580333290929907, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 30.836650848388672, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8612442016601562, "num_tokens": 257377172.0, "step": 6745 }, { "epoch": 0.8581605393715812, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.78782081604004, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8497613668441772, "num_tokens": 257410312.0, "step": 6746 }, { "epoch": 0.8582877496501717, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.180883407592773, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8594250679016113, "num_tokens": 257449919.0, "step": 6747 }, { "epoch": 0.8584149599287623, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.485864639282227, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.867298424243927, "num_tokens": 257488296.0, "step": 6748 }, { "epoch": 0.8585421702073528, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.29705047607422, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8653745055198669, "num_tokens": 257524678.0, "step": 6749 }, { "epoch": 0.8586693804859432, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.426889419555664, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8619717359542847, "num_tokens": 257559060.0, "step": 6750 }, { "epoch": 0.8587965907645337, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.347749710083008, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8664507865905762, "num_tokens": 257594340.0, "step": 6751 }, { "epoch": 0.8589238010431243, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.695884704589844, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8682893514633179, "num_tokens": 257627936.0, "step": 6752 }, { "epoch": 0.8590510113217148, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.59001922607422, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8539251089096069, "num_tokens": 257660311.0, "step": 6753 }, { "epoch": 0.8591782216003053, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.321046829223633, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8494971990585327, "num_tokens": 257697496.0, "step": 6754 }, { "epoch": 0.8593054318788959, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.603166580200195, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8384872078895569, "num_tokens": 257737242.0, "step": 6755 }, { "epoch": 0.8594326421574863, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.495744705200195, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8584284782409668, "num_tokens": 257772946.0, "step": 6756 }, { "epoch": 0.8595598524360768, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.561742782592773, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8526901006698608, "num_tokens": 257807198.0, "step": 6757 }, { "epoch": 0.8596870627146673, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.521591186523438, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8632760047912598, "num_tokens": 257848618.0, "step": 6758 }, { "epoch": 0.8598142729932579, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.341838836669922, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8674632906913757, "num_tokens": 257887805.0, "step": 6759 }, { "epoch": 0.8599414832718484, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.541671752929688, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8636062145233154, "num_tokens": 257929276.0, "step": 6760 }, { "epoch": 0.8600686935504389, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.36564826965332, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8501318693161011, "num_tokens": 257958629.0, "step": 6761 }, { "epoch": 0.8601959038290294, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.451358795166016, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8536592125892639, "num_tokens": 257990303.0, "step": 6762 }, { "epoch": 0.8603231141076199, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.213594436645508, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8412083387374878, "num_tokens": 258028704.0, "step": 6763 }, { "epoch": 0.8604503243862104, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.709861755371094, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8656484484672546, "num_tokens": 258062757.0, "step": 6764 }, { "epoch": 0.8605775346648009, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.30722427368164, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8563617467880249, "num_tokens": 258098574.0, "step": 6765 }, { "epoch": 0.8607047449433914, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.475257873535156, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8663970232009888, "num_tokens": 258130937.0, "step": 6766 }, { "epoch": 0.860831955221982, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.6812744140625, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8595972061157227, "num_tokens": 258165142.0, "step": 6767 }, { "epoch": 0.8609591655005725, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.628877639770508, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8743860721588135, "num_tokens": 258203986.0, "step": 6768 }, { "epoch": 0.8610863757791629, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.79633331298828, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8573911190032959, "num_tokens": 258238692.0, "step": 6769 }, { "epoch": 0.8612135860577534, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.523727416992188, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8368653059005737, "num_tokens": 258280942.0, "step": 6770 }, { "epoch": 0.861340796336344, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.661680221557617, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.861946702003479, "num_tokens": 258318835.0, "step": 6771 }, { "epoch": 0.8614680066149345, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.4744930267334, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8473415970802307, "num_tokens": 258359461.0, "step": 6772 }, { "epoch": 0.861595216893525, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.77707290649414, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8536683320999146, "num_tokens": 258394582.0, "step": 6773 }, { "epoch": 0.8617224271721156, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.200576782226562, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.859808623790741, "num_tokens": 258438526.0, "step": 6774 }, { "epoch": 0.861849637450706, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.6532039642334, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8440026044845581, "num_tokens": 258479332.0, "step": 6775 }, { "epoch": 0.8619768477292965, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.654874801635742, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8481647968292236, "num_tokens": 258513202.0, "step": 6776 }, { "epoch": 0.862104058007887, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.42107582092285, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8593515157699585, "num_tokens": 258549145.0, "step": 6777 }, { "epoch": 0.8622312682864776, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.995075225830078, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8489515781402588, "num_tokens": 258586421.0, "step": 6778 }, { "epoch": 0.8623584785650681, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.57084846496582, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8559988141059875, "num_tokens": 258620014.0, "step": 6779 }, { "epoch": 0.8624856888436586, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.662235260009766, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8624616861343384, "num_tokens": 258656124.0, "step": 6780 }, { "epoch": 0.862612899122249, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.623764038085938, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8462029695510864, "num_tokens": 258692537.0, "step": 6781 }, { "epoch": 0.8627401094008396, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.74996566772461, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8613845705986023, "num_tokens": 258732737.0, "step": 6782 }, { "epoch": 0.8628673196794301, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.975507736206055, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8454511165618896, "num_tokens": 258770634.0, "step": 6783 }, { "epoch": 0.8629945299580206, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.567007064819336, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8573839068412781, "num_tokens": 258809643.0, "step": 6784 }, { "epoch": 0.8631217402366111, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.509967803955078, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8614190220832825, "num_tokens": 258845336.0, "step": 6785 }, { "epoch": 0.8632489505152017, "ewc_loss": 0.0869140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 6.961822509765625e-05, "grad_norm": 31.717031478881836, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8600522875785828, "num_tokens": 258882206.0, "step": 6786 }, { "epoch": 0.8633761607937921, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.58922004699707, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8642516732215881, "num_tokens": 258918661.0, "step": 6787 }, { "epoch": 0.8635033710723826, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.762367248535156, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8521233201026917, "num_tokens": 258957452.0, "step": 6788 }, { "epoch": 0.8636305813509731, "ewc_loss": 0.08740234375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.009506225585938e-05, "grad_norm": 31.73227310180664, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8482732772827148, "num_tokens": 259003507.0, "step": 6789 }, { "epoch": 0.8637577916295637, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.732221603393555, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8519855737686157, "num_tokens": 259042284.0, "step": 6790 }, { "epoch": 0.8638850019081542, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.642623901367188, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8709337711334229, "num_tokens": 259082929.0, "step": 6791 }, { "epoch": 0.8640122121867447, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.587169647216797, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8715029954910278, "num_tokens": 259125920.0, "step": 6792 }, { "epoch": 0.8641394224653351, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.90166664123535, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8579633235931396, "num_tokens": 259163140.0, "step": 6793 }, { "epoch": 0.8642666327439257, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.677751541137695, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8552051782608032, "num_tokens": 259199038.0, "step": 6794 }, { "epoch": 0.8643938430225162, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.901809692382812, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.854644238948822, "num_tokens": 259235713.0, "step": 6795 }, { "epoch": 0.8645210533011067, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.542648315429688, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8466958403587341, "num_tokens": 259278314.0, "step": 6796 }, { "epoch": 0.8646482635796973, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.6574649810791, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8641544580459595, "num_tokens": 259313039.0, "step": 6797 }, { "epoch": 0.8647754738582878, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.880111694335938, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.863798975944519, "num_tokens": 259348845.0, "step": 6798 }, { "epoch": 0.8649026841368782, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.659902572631836, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8511611819267273, "num_tokens": 259387345.0, "step": 6799 }, { "epoch": 0.8650298944154687, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.843183517456055, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8569180369377136, "num_tokens": 259427835.0, "step": 6800 }, { "epoch": 0.8651571046940593, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.98860740661621, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.861138641834259, "num_tokens": 259460453.0, "step": 6801 }, { "epoch": 0.8652843149726498, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.764244079589844, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.863461971282959, "num_tokens": 259499296.0, "step": 6802 }, { "epoch": 0.8654115252512403, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.841808319091797, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8531466126441956, "num_tokens": 259537649.0, "step": 6803 }, { "epoch": 0.8655387355298308, "ewc_loss": 0.087890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.05718994140625e-05, "grad_norm": 31.412860870361328, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8497961163520813, "num_tokens": 259583136.0, "step": 6804 }, { "epoch": 0.8656659458084213, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.840229034423828, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8616795539855957, "num_tokens": 259620422.0, "step": 6805 }, { "epoch": 0.8657931560870118, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.677162170410156, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8451094627380371, "num_tokens": 259663948.0, "step": 6806 }, { "epoch": 0.8659203663656023, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.152557373046875e-05, "grad_norm": 31.528358459472656, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8591523766517639, "num_tokens": 259701986.0, "step": 6807 }, { "epoch": 0.8660475766441929, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.772857666015625, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8726975917816162, "num_tokens": 259740684.0, "step": 6808 }, { "epoch": 0.8661747869227834, "ewc_loss": 0.08837890625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.104873657226562e-05, "grad_norm": 31.37256622314453, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8406956195831299, "num_tokens": 259781659.0, "step": 6809 }, { "epoch": 0.8663019972013739, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.05268859863281, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8508538007736206, "num_tokens": 259818554.0, "step": 6810 }, { "epoch": 0.8664292074799644, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.45964813232422, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8591780662536621, "num_tokens": 259856652.0, "step": 6811 }, { "epoch": 0.8665564177585549, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.15594482421875, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8763746619224548, "num_tokens": 259898255.0, "step": 6812 }, { "epoch": 0.8666836280371454, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.558053970336914, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8625498414039612, "num_tokens": 259935305.0, "step": 6813 }, { "epoch": 0.8668108383157359, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.907052993774414, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8615415096282959, "num_tokens": 259975245.0, "step": 6814 }, { "epoch": 0.8669380485943264, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.615535736083984, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.862952709197998, "num_tokens": 260014150.0, "step": 6815 }, { "epoch": 0.867065258872917, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.82036781311035, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8473591804504395, "num_tokens": 260054655.0, "step": 6816 }, { "epoch": 0.8671924691515075, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.673240661621094, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.857243537902832, "num_tokens": 260091528.0, "step": 6817 }, { "epoch": 0.8673196794300979, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.499692916870117, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.872880220413208, "num_tokens": 260123069.0, "step": 6818 }, { "epoch": 0.8674468897086884, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.94420051574707, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8418639898300171, "num_tokens": 260154482.0, "step": 6819 }, { "epoch": 0.867574099987279, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.440378189086914, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8593339323997498, "num_tokens": 260193983.0, "step": 6820 }, { "epoch": 0.8677013102658695, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.133445739746094, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8595688939094543, "num_tokens": 260228705.0, "step": 6821 }, { "epoch": 0.86782852054446, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.665945053100586, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8562679886817932, "num_tokens": 260262821.0, "step": 6822 }, { "epoch": 0.8679557308230506, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.951629638671875, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8551838397979736, "num_tokens": 260298949.0, "step": 6823 }, { "epoch": 0.868082941101641, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.648942947387695, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8638825416564941, "num_tokens": 260340028.0, "step": 6824 }, { "epoch": 0.8682101513802315, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.905113220214844, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8555119037628174, "num_tokens": 260381328.0, "step": 6825 }, { "epoch": 0.868337361658822, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.636749267578125, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8580933809280396, "num_tokens": 260423421.0, "step": 6826 }, { "epoch": 0.8684645719374126, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.839096069335938, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8551637530326843, "num_tokens": 260461985.0, "step": 6827 }, { "epoch": 0.8685917822160031, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.859590530395508, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.842101514339447, "num_tokens": 260503041.0, "step": 6828 }, { "epoch": 0.8687189924945936, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.74306869506836, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8656519651412964, "num_tokens": 260545553.0, "step": 6829 }, { "epoch": 0.868846202773184, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.232421875, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8634542226791382, "num_tokens": 260577160.0, "step": 6830 }, { "epoch": 0.8689734130517746, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.6025333404541, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8701296448707581, "num_tokens": 260614748.0, "step": 6831 }, { "epoch": 0.8691006233303651, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.900415420532227, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.857641339302063, "num_tokens": 260651762.0, "step": 6832 }, { "epoch": 0.8692278336089556, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.79116439819336, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8447124361991882, "num_tokens": 260694438.0, "step": 6833 }, { "epoch": 0.8693550438875461, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.778953552246094, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8652083873748779, "num_tokens": 260730256.0, "step": 6834 }, { "epoch": 0.8694822541661367, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.887489318847656, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.83709716796875, "num_tokens": 260766190.0, "step": 6835 }, { "epoch": 0.8696094644447271, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.523263931274414, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8717557787895203, "num_tokens": 260801733.0, "step": 6836 }, { "epoch": 0.8697366747233176, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.961137771606445, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.868402361869812, "num_tokens": 260843461.0, "step": 6837 }, { "epoch": 0.8698638850019081, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.73665428161621, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8562599420547485, "num_tokens": 260875006.0, "step": 6838 }, { "epoch": 0.8699910952804987, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.28437805175781, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8508304953575134, "num_tokens": 260917553.0, "step": 6839 }, { "epoch": 0.8701183055590892, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 31.538267135620117, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.833228349685669, "num_tokens": 260955293.0, "step": 6840 }, { "epoch": 0.8702455158376797, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.34377670288086, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8575443029403687, "num_tokens": 260983966.0, "step": 6841 }, { "epoch": 0.8703727261162701, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.808151245117188, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8704442977905273, "num_tokens": 261019434.0, "step": 6842 }, { "epoch": 0.8704999363948607, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.1407356262207, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.84433913230896, "num_tokens": 261053761.0, "step": 6843 }, { "epoch": 0.8706271466734512, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.771453857421875, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8483282327651978, "num_tokens": 261093490.0, "step": 6844 }, { "epoch": 0.8707543569520417, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.120426177978516, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8583255410194397, "num_tokens": 261129518.0, "step": 6845 }, { "epoch": 0.8708815672306323, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.79750633239746, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8408738374710083, "num_tokens": 261164282.0, "step": 6846 }, { "epoch": 0.8710087775092228, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.06174087524414, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8367823362350464, "num_tokens": 261205055.0, "step": 6847 }, { "epoch": 0.8711359877878132, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.941051483154297, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8507634997367859, "num_tokens": 261242329.0, "step": 6848 }, { "epoch": 0.8712631980664037, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.542558670043945, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8303453326225281, "num_tokens": 261280330.0, "step": 6849 }, { "epoch": 0.8713904083449943, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.046077728271484, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8678464889526367, "num_tokens": 261318232.0, "step": 6850 }, { "epoch": 0.8715176186235848, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.77433204650879, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8727010488510132, "num_tokens": 261351483.0, "step": 6851 }, { "epoch": 0.8716448289021753, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.851072311401367, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8593782782554626, "num_tokens": 261387058.0, "step": 6852 }, { "epoch": 0.8717720391807658, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.062042236328125, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8734354972839355, "num_tokens": 261430110.0, "step": 6853 }, { "epoch": 0.8718992494593563, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.00847244262695, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8642243146896362, "num_tokens": 261470707.0, "step": 6854 }, { "epoch": 0.8720264597379468, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.969165802001953, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8641301393508911, "num_tokens": 261507146.0, "step": 6855 }, { "epoch": 0.8721536700165373, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.84052276611328, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8420411944389343, "num_tokens": 261549723.0, "step": 6856 }, { "epoch": 0.8722808802951278, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.79100799560547, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8627427816390991, "num_tokens": 261591030.0, "step": 6857 }, { "epoch": 0.8724080905737184, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.13608169555664, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8663586378097534, "num_tokens": 261623919.0, "step": 6858 }, { "epoch": 0.8725353008523089, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.81301498413086, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8735495805740356, "num_tokens": 261655277.0, "step": 6859 }, { "epoch": 0.8726625111308994, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.07809829711914, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8449812531471252, "num_tokens": 261697462.0, "step": 6860 }, { "epoch": 0.8727897214094898, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.761688232421875, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8551175594329834, "num_tokens": 261739134.0, "step": 6861 }, { "epoch": 0.8729169316880804, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.084678649902344, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8601951599121094, "num_tokens": 261776295.0, "step": 6862 }, { "epoch": 0.8730441419666709, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 31.983163833618164, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8604375123977661, "num_tokens": 261812388.0, "step": 6863 }, { "epoch": 0.8731713522452614, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.23588943481445, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8591927886009216, "num_tokens": 261855537.0, "step": 6864 }, { "epoch": 0.873298562523852, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.98828125, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8668748736381531, "num_tokens": 261899922.0, "step": 6865 }, { "epoch": 0.8734257728024425, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.423118591308594, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8694759011268616, "num_tokens": 261934475.0, "step": 6866 }, { "epoch": 0.8735529830810329, "ewc_loss": 0.0888671875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.200241088867188e-05, "grad_norm": 32.04007339477539, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8586677312850952, "num_tokens": 261973395.0, "step": 6867 }, { "epoch": 0.8736801933596234, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.924436569213867, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8414560556411743, "num_tokens": 262011644.0, "step": 6868 }, { "epoch": 0.873807403638214, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.18822479248047, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8527418375015259, "num_tokens": 262044038.0, "step": 6869 }, { "epoch": 0.8739346139168045, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.349151611328125, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8570082187652588, "num_tokens": 262081023.0, "step": 6870 }, { "epoch": 0.874061824195395, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.127681732177734, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8503671884536743, "num_tokens": 262123981.0, "step": 6871 }, { "epoch": 0.8741890344739855, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.07643127441406, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8470975160598755, "num_tokens": 262161542.0, "step": 6872 }, { "epoch": 0.874316244752576, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.32628631591797, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.847318172454834, "num_tokens": 262202482.0, "step": 6873 }, { "epoch": 0.8744434550311665, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.91620445251465, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.859032928943634, "num_tokens": 262239499.0, "step": 6874 }, { "epoch": 0.874570665309757, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.51259994506836, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8595349788665771, "num_tokens": 262281324.0, "step": 6875 }, { "epoch": 0.8746978755883476, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.1318473815918, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8497918844223022, "num_tokens": 262318772.0, "step": 6876 }, { "epoch": 0.8748250858669381, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.92352294921875, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8695471286773682, "num_tokens": 262353264.0, "step": 6877 }, { "epoch": 0.8749522961455286, "ewc_loss": 0.08935546875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.2479248046875e-05, "grad_norm": 32.17791748046875, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8542159795761108, "num_tokens": 262393360.0, "step": 6878 }, { "epoch": 0.875079506424119, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.958833694458008, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.868228554725647, "num_tokens": 262432010.0, "step": 6879 }, { "epoch": 0.8752067167027096, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.23346710205078, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8568586707115173, "num_tokens": 262468828.0, "step": 6880 }, { "epoch": 0.8753339269813001, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.090179443359375, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8646732568740845, "num_tokens": 262515141.0, "step": 6881 }, { "epoch": 0.8754611372598906, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.304203033447266, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8549662232398987, "num_tokens": 262553137.0, "step": 6882 }, { "epoch": 0.8755883475384811, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.935976028442383, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8794851899147034, "num_tokens": 262593847.0, "step": 6883 }, { "epoch": 0.8757155578170717, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.27581024169922, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8457217216491699, "num_tokens": 262635048.0, "step": 6884 }, { "epoch": 0.8758427680956621, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.199615478515625, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8495764136314392, "num_tokens": 262671200.0, "step": 6885 }, { "epoch": 0.8759699783742526, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.12324905395508, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8597410321235657, "num_tokens": 262708304.0, "step": 6886 }, { "epoch": 0.8760971886528431, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.269351959228516, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8547548055648804, "num_tokens": 262750680.0, "step": 6887 }, { "epoch": 0.8762243989314337, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 32.29527282714844, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.852091908454895, "num_tokens": 262784845.0, "step": 6888 }, { "epoch": 0.8763516092100242, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.922443389892578, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8575053215026855, "num_tokens": 262822646.0, "step": 6889 }, { "epoch": 0.8764788194886147, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.16518020629883, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8538498878479004, "num_tokens": 262863663.0, "step": 6890 }, { "epoch": 0.8766060297672051, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.871522903442383, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.844322681427002, "num_tokens": 262903726.0, "step": 6891 }, { "epoch": 0.8767332400457957, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.93329620361328, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8462318181991577, "num_tokens": 262942748.0, "step": 6892 }, { "epoch": 0.8768604503243862, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.72783088684082, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8369032740592957, "num_tokens": 262977639.0, "step": 6893 }, { "epoch": 0.8769876606029767, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.3820915222168, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8610730767250061, "num_tokens": 263015970.0, "step": 6894 }, { "epoch": 0.8771148708815673, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.792232513427734, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8677856922149658, "num_tokens": 263058618.0, "step": 6895 }, { "epoch": 0.8772420811601578, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.16590118408203, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8400985598564148, "num_tokens": 263098628.0, "step": 6896 }, { "epoch": 0.8773692914387482, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.27278137207031, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8605276346206665, "num_tokens": 263136793.0, "step": 6897 }, { "epoch": 0.8774965017173387, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.917814254760742, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8603535890579224, "num_tokens": 263170996.0, "step": 6898 }, { "epoch": 0.8776237119959293, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 32.39716720581055, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8475478887557983, "num_tokens": 263199961.0, "step": 6899 }, { "epoch": 0.8777509222745198, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.75015640258789, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8553223609924316, "num_tokens": 263241541.0, "step": 6900 }, { "epoch": 0.8778781325531103, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.37980651855469, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8700266480445862, "num_tokens": 263278778.0, "step": 6901 }, { "epoch": 0.8780053428317008, "ewc_loss": 0.08984375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.295608520507812e-05, "grad_norm": 31.980417251586914, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8612852096557617, "num_tokens": 263310155.0, "step": 6902 }, { "epoch": 0.8781325531102913, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.23996353149414, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8536878824234009, "num_tokens": 263355793.0, "step": 6903 }, { "epoch": 0.8782597633888818, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.968725204467773, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8668269515037537, "num_tokens": 263394446.0, "step": 6904 }, { "epoch": 0.8783869736674723, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.45513153076172, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8597740530967712, "num_tokens": 263438479.0, "step": 6905 }, { "epoch": 0.8785141839460628, "ewc_loss": 0.09033203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.343292236328125e-05, "grad_norm": 31.770301818847656, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.842223048210144, "num_tokens": 263478514.0, "step": 6906 }, { "epoch": 0.8786413942246534, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.350006103515625, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8381789922714233, "num_tokens": 263516734.0, "step": 6907 }, { "epoch": 0.8787686045032439, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.159461975097656, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8484063744544983, "num_tokens": 263560105.0, "step": 6908 }, { "epoch": 0.8788958147818343, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.080360412597656, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8586453795433044, "num_tokens": 263597033.0, "step": 6909 }, { "epoch": 0.8790230250604248, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.334617614746094, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8645444512367249, "num_tokens": 263631106.0, "step": 6910 }, { "epoch": 0.8791502353390154, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.05733871459961, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.851597785949707, "num_tokens": 263668227.0, "step": 6911 }, { "epoch": 0.8792774456176059, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.11581039428711, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.859138011932373, "num_tokens": 263704335.0, "step": 6912 }, { "epoch": 0.8794046558961964, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.00669860839844, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8632455468177795, "num_tokens": 263742532.0, "step": 6913 }, { "epoch": 0.879531866174787, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.986852645874023, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8581444025039673, "num_tokens": 263782321.0, "step": 6914 }, { "epoch": 0.8796590764533775, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.14344787597656, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8649600744247437, "num_tokens": 263818037.0, "step": 6915 }, { "epoch": 0.8797862867319679, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.25401306152344, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8428566455841064, "num_tokens": 263854901.0, "step": 6916 }, { "epoch": 0.8799134970105584, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.07969665527344, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8622246980667114, "num_tokens": 263896241.0, "step": 6917 }, { "epoch": 0.880040707289149, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.04387664794922, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8590666055679321, "num_tokens": 263934673.0, "step": 6918 }, { "epoch": 0.8801679175677395, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.85580825805664, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8501358032226562, "num_tokens": 263973456.0, "step": 6919 }, { "epoch": 0.88029512784633, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.14806365966797, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8405628204345703, "num_tokens": 264006050.0, "step": 6920 }, { "epoch": 0.8804223381249205, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.21403884887695, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8668608069419861, "num_tokens": 264042736.0, "step": 6921 }, { "epoch": 0.880549548403511, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.833181381225586, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8485796451568604, "num_tokens": 264084433.0, "step": 6922 }, { "epoch": 0.8806767586821015, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.182525634765625, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8644901514053345, "num_tokens": 264118331.0, "step": 6923 }, { "epoch": 0.880803968960692, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.10395050048828, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8597954511642456, "num_tokens": 264152015.0, "step": 6924 }, { "epoch": 0.8809311792392825, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.05181884765625, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8634728193283081, "num_tokens": 264187085.0, "step": 6925 }, { "epoch": 0.8810583895178731, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.14820861816406, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8543460369110107, "num_tokens": 264225558.0, "step": 6926 }, { "epoch": 0.8811855997964636, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.978748321533203, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8484361171722412, "num_tokens": 264264220.0, "step": 6927 }, { "epoch": 0.881312810075054, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.41206741333008, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8611450791358948, "num_tokens": 264302097.0, "step": 6928 }, { "epoch": 0.8814400203536445, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.937597274780273, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8589056134223938, "num_tokens": 264341966.0, "step": 6929 }, { "epoch": 0.8815672306322351, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.55550765991211, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8337721824645996, "num_tokens": 264384298.0, "step": 6930 }, { "epoch": 0.8816944409108256, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.775447845458984, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8583112955093384, "num_tokens": 264422513.0, "step": 6931 }, { "epoch": 0.8818216511894161, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.58150100708008, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8702636957168579, "num_tokens": 264461819.0, "step": 6932 }, { "epoch": 0.8819488614680067, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.98607063293457, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8468198776245117, "num_tokens": 264502618.0, "step": 6933 }, { "epoch": 0.8820760717465971, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.304298400878906, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8512297868728638, "num_tokens": 264534382.0, "step": 6934 }, { "epoch": 0.8822032820251876, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.25182342529297, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8466553688049316, "num_tokens": 264575297.0, "step": 6935 }, { "epoch": 0.8823304923037781, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.20057678222656, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8627113103866577, "num_tokens": 264611822.0, "step": 6936 }, { "epoch": 0.8824577025823687, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.175445556640625, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8529146909713745, "num_tokens": 264657483.0, "step": 6937 }, { "epoch": 0.8825849128609592, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.28484344482422, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8616607189178467, "num_tokens": 264690633.0, "step": 6938 }, { "epoch": 0.8827121231395497, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.854293823242188, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8715998530387878, "num_tokens": 264733268.0, "step": 6939 }, { "epoch": 0.8828393334181401, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.41499710083008, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8512057065963745, "num_tokens": 264774769.0, "step": 6940 }, { "epoch": 0.8829665436967307, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.880380630493164, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.853364109992981, "num_tokens": 264806658.0, "step": 6941 }, { "epoch": 0.8830937539753212, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.286869049072266, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8448948264122009, "num_tokens": 264841054.0, "step": 6942 }, { "epoch": 0.8832209642539117, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.45378494262695, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8518890738487244, "num_tokens": 264880241.0, "step": 6943 }, { "epoch": 0.8833481745325023, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.07625198364258, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8588209748268127, "num_tokens": 264922449.0, "step": 6944 }, { "epoch": 0.8834753848110928, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.18219757080078, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8566046357154846, "num_tokens": 264961605.0, "step": 6945 }, { "epoch": 0.8836025950896832, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.39352798461914, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8651079535484314, "num_tokens": 264992924.0, "step": 6946 }, { "epoch": 0.8837298053682737, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.147911071777344, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8564164638519287, "num_tokens": 265031849.0, "step": 6947 }, { "epoch": 0.8838570156468643, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.176979064941406, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8685775995254517, "num_tokens": 265069714.0, "step": 6948 }, { "epoch": 0.8839842259254548, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.47636032104492, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8512141704559326, "num_tokens": 265103325.0, "step": 6949 }, { "epoch": 0.8841114362040453, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 31.907695770263672, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8357857465744019, "num_tokens": 265139133.0, "step": 6950 }, { "epoch": 0.8842386464826358, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.4969482421875, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8324320316314697, "num_tokens": 265183536.0, "step": 6951 }, { "epoch": 0.8843658567612263, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.1134033203125, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8492438197135925, "num_tokens": 265226288.0, "step": 6952 }, { "epoch": 0.8844930670398168, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.26092529296875, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8557438254356384, "num_tokens": 265265846.0, "step": 6953 }, { "epoch": 0.8846202773184073, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.265907287597656, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8707084655761719, "num_tokens": 265300612.0, "step": 6954 }, { "epoch": 0.8847474875969978, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.01435089111328, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8658874034881592, "num_tokens": 265341440.0, "step": 6955 }, { "epoch": 0.8848746978755884, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.48223114013672, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.84593665599823, "num_tokens": 265384854.0, "step": 6956 }, { "epoch": 0.8850019081541789, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.21366500854492, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8492043614387512, "num_tokens": 265420611.0, "step": 6957 }, { "epoch": 0.8851291184327693, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.13790512084961, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.854644775390625, "num_tokens": 265457720.0, "step": 6958 }, { "epoch": 0.8852563287113598, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.30617141723633, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8657963275909424, "num_tokens": 265493434.0, "step": 6959 }, { "epoch": 0.8853835389899504, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.05222702026367, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8590133190155029, "num_tokens": 265529015.0, "step": 6960 }, { "epoch": 0.8855107492685409, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.35200881958008, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8415616154670715, "num_tokens": 265568698.0, "step": 6961 }, { "epoch": 0.8856379595471314, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.20099639892578, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8581542372703552, "num_tokens": 265605631.0, "step": 6962 }, { "epoch": 0.885765169825722, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.431434631347656, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8582229018211365, "num_tokens": 265647071.0, "step": 6963 }, { "epoch": 0.8858923801043125, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.32891845703125, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8562150001525879, "num_tokens": 265688680.0, "step": 6964 }, { "epoch": 0.8860195903829029, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.32344055175781, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8626807332038879, "num_tokens": 265727707.0, "step": 6965 }, { "epoch": 0.8861468006614934, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.325294494628906, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8542214632034302, "num_tokens": 265765860.0, "step": 6966 }, { "epoch": 0.886274010940084, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.25702667236328, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8596303462982178, "num_tokens": 265802988.0, "step": 6967 }, { "epoch": 0.8864012212186745, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.33541488647461, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8503837585449219, "num_tokens": 265840614.0, "step": 6968 }, { "epoch": 0.886528431497265, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.926612854003906, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8304239511489868, "num_tokens": 265883032.0, "step": 6969 }, { "epoch": 0.8866556417758555, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.48861312866211, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8684845566749573, "num_tokens": 265918309.0, "step": 6970 }, { "epoch": 0.886782852054446, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.22529602050781, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8594579100608826, "num_tokens": 265956406.0, "step": 6971 }, { "epoch": 0.8869100623330365, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.31184387207031, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.864925742149353, "num_tokens": 265988413.0, "step": 6972 }, { "epoch": 0.887037272611627, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.12867736816406, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8555356860160828, "num_tokens": 266022036.0, "step": 6973 }, { "epoch": 0.8871644828902175, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.32109069824219, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.855267345905304, "num_tokens": 266053735.0, "step": 6974 }, { "epoch": 0.8872916931688081, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.03621292114258, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.87089604139328, "num_tokens": 266094198.0, "step": 6975 }, { "epoch": 0.8874189034473986, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.27653503417969, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8655070066452026, "num_tokens": 266132213.0, "step": 6976 }, { "epoch": 0.887546113725989, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.13521194458008, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8573703765869141, "num_tokens": 266168001.0, "step": 6977 }, { "epoch": 0.8876733240045795, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.24323654174805, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8269876837730408, "num_tokens": 266208771.0, "step": 6978 }, { "epoch": 0.8878005342831701, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.30499267578125, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8612702488899231, "num_tokens": 266248090.0, "step": 6979 }, { "epoch": 0.8879277445617606, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.2181396484375, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8462576270103455, "num_tokens": 266283748.0, "step": 6980 }, { "epoch": 0.8880549548403511, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 31.8501033782959, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8645663857460022, "num_tokens": 266316613.0, "step": 6981 }, { "epoch": 0.8881821651189417, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.27264404296875, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.859286904335022, "num_tokens": 266357515.0, "step": 6982 }, { "epoch": 0.8883093753975321, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.1494140625, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8681648969650269, "num_tokens": 266393593.0, "step": 6983 }, { "epoch": 0.8884365856761226, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.14639663696289, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8692773580551147, "num_tokens": 266436104.0, "step": 6984 }, { "epoch": 0.8885637959547131, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.23883056640625, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8691933155059814, "num_tokens": 266472809.0, "step": 6985 }, { "epoch": 0.8886910062333037, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.25808334350586, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8678475022315979, "num_tokens": 266512726.0, "step": 6986 }, { "epoch": 0.8888182165118942, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.319786071777344, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8466149568557739, "num_tokens": 266546098.0, "step": 6987 }, { "epoch": 0.8889454267904847, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.07041931152344, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.847907304763794, "num_tokens": 266588478.0, "step": 6988 }, { "epoch": 0.8890726370690751, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.456031799316406, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8453953266143799, "num_tokens": 266628975.0, "step": 6989 }, { "epoch": 0.8891998473476657, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.816211700439453, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8662246465682983, "num_tokens": 266667533.0, "step": 6990 }, { "epoch": 0.8893270576262562, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.54154586791992, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.839264988899231, "num_tokens": 266706990.0, "step": 6991 }, { "epoch": 0.8894542679048467, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.903650283813477, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8433013558387756, "num_tokens": 266747314.0, "step": 6992 }, { "epoch": 0.8895814781834372, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.68490982055664, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8474487066268921, "num_tokens": 266791031.0, "step": 6993 }, { "epoch": 0.8897086884620278, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.9562931060791, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8567975163459778, "num_tokens": 266823236.0, "step": 6994 }, { "epoch": 0.8898358987406182, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.85783767700195, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8549681901931763, "num_tokens": 266867993.0, "step": 6995 }, { "epoch": 0.8899631090192087, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 31.959129333496094, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8392927646636963, "num_tokens": 266908573.0, "step": 6996 }, { "epoch": 0.8900903192977992, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.21387481689453, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8545664548873901, "num_tokens": 266951167.0, "step": 6997 }, { "epoch": 0.8902175295763898, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.04429626464844, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8546417355537415, "num_tokens": 266988976.0, "step": 6998 }, { "epoch": 0.8903447398549803, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.48910140991211, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8550403118133545, "num_tokens": 267022450.0, "step": 6999 }, { "epoch": 0.8904719501335708, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 31.918127059936523, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8502458333969116, "num_tokens": 267059933.0, "step": 7000 }, { "epoch": 0.8905991604121613, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.75979995727539, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8487376570701599, "num_tokens": 267092518.0, "step": 7001 }, { "epoch": 0.8907263706907518, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.23879623413086, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8496341109275818, "num_tokens": 267130008.0, "step": 7002 }, { "epoch": 0.8908535809693423, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.388885498046875, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.859669029712677, "num_tokens": 267171039.0, "step": 7003 }, { "epoch": 0.8909807912479328, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.587562561035156, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8589562773704529, "num_tokens": 267211197.0, "step": 7004 }, { "epoch": 0.8911080015265234, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.26985168457031, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8617977499961853, "num_tokens": 267247471.0, "step": 7005 }, { "epoch": 0.8912352118051139, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.55488204956055, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8291398286819458, "num_tokens": 267287276.0, "step": 7006 }, { "epoch": 0.8913624220837043, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.185089111328125, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.853226900100708, "num_tokens": 267323827.0, "step": 7007 }, { "epoch": 0.8914896323622948, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.348114013671875, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8658619523048401, "num_tokens": 267364053.0, "step": 7008 }, { "epoch": 0.8916168426408854, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.4057731628418, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8397630453109741, "num_tokens": 267404393.0, "step": 7009 }, { "epoch": 0.8917440529194759, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.65757751464844, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8566818833351135, "num_tokens": 267444589.0, "step": 7010 }, { "epoch": 0.8918712631980664, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.17995834350586, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8557389974594116, "num_tokens": 267484460.0, "step": 7011 }, { "epoch": 0.891998473476657, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.488704681396484, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8496148586273193, "num_tokens": 267517390.0, "step": 7012 }, { "epoch": 0.8921256837552475, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.17195510864258, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8570752143859863, "num_tokens": 267555813.0, "step": 7013 }, { "epoch": 0.8922528940338379, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.69352340698242, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8534935712814331, "num_tokens": 267595818.0, "step": 7014 }, { "epoch": 0.8923801043124284, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.20273971557617, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8295397162437439, "num_tokens": 267633106.0, "step": 7015 }, { "epoch": 0.892507314591019, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.60161590576172, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8593604564666748, "num_tokens": 267670166.0, "step": 7016 }, { "epoch": 0.8926345248696095, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.33949661254883, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8535865545272827, "num_tokens": 267707284.0, "step": 7017 }, { "epoch": 0.8927617351482, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.53685760498047, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8573189377784729, "num_tokens": 267742349.0, "step": 7018 }, { "epoch": 0.8928889454267905, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.30428695678711, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8612452149391174, "num_tokens": 267776003.0, "step": 7019 }, { "epoch": 0.893016155705381, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.5589599609375, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8512821197509766, "num_tokens": 267815887.0, "step": 7020 }, { "epoch": 0.8931433659839715, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.28681945800781, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8662409782409668, "num_tokens": 267855874.0, "step": 7021 }, { "epoch": 0.893270576262562, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.60808181762695, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8670907020568848, "num_tokens": 267897901.0, "step": 7022 }, { "epoch": 0.8933977865411525, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.192970275878906, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.867198646068573, "num_tokens": 267938967.0, "step": 7023 }, { "epoch": 0.8935249968197431, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.719268798828125, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8528222441673279, "num_tokens": 267975196.0, "step": 7024 }, { "epoch": 0.8936522070983336, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.57680130004883, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8700249195098877, "num_tokens": 268014407.0, "step": 7025 }, { "epoch": 0.893779417376924, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.99428939819336, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8701128959655762, "num_tokens": 268043892.0, "step": 7026 }, { "epoch": 0.8939066276555145, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.36751937866211, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8561680316925049, "num_tokens": 268079651.0, "step": 7027 }, { "epoch": 0.8940338379341051, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.67662048339844, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8595683574676514, "num_tokens": 268114021.0, "step": 7028 }, { "epoch": 0.8941610482126956, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.56833267211914, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.855930507183075, "num_tokens": 268155315.0, "step": 7029 }, { "epoch": 0.8942882584912861, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.33906936645508, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8531829118728638, "num_tokens": 268199312.0, "step": 7030 }, { "epoch": 0.8944154687698767, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.51298141479492, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8635663390159607, "num_tokens": 268236955.0, "step": 7031 }, { "epoch": 0.8945426790484671, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.24100112915039, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.853394627571106, "num_tokens": 268272183.0, "step": 7032 }, { "epoch": 0.8946698893270576, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 33.05704116821289, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8473726511001587, "num_tokens": 268305407.0, "step": 7033 }, { "epoch": 0.8947970996056481, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.31050491333008, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8663409948348999, "num_tokens": 268340443.0, "step": 7034 }, { "epoch": 0.8949243098842387, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.933921813964844, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8748588562011719, "num_tokens": 268375597.0, "step": 7035 }, { "epoch": 0.8950515201628292, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.31354522705078, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8509286642074585, "num_tokens": 268412872.0, "step": 7036 }, { "epoch": 0.8951787304414197, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 33.0217170715332, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8568429946899414, "num_tokens": 268452021.0, "step": 7037 }, { "epoch": 0.8953059407200101, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.30963897705078, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.864592969417572, "num_tokens": 268486354.0, "step": 7038 }, { "epoch": 0.8954331509986007, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.99785614013672, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8506890535354614, "num_tokens": 268519918.0, "step": 7039 }, { "epoch": 0.8955603612771912, "ewc_loss": 0.09130859375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.43865966796875e-05, "grad_norm": 32.86978530883789, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8473771810531616, "num_tokens": 268554259.0, "step": 7040 }, { "epoch": 0.8956875715557817, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.88916778564453, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8508301377296448, "num_tokens": 268595785.0, "step": 7041 }, { "epoch": 0.8958147818343722, "ewc_loss": 0.0908203125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.390975952148438e-05, "grad_norm": 32.466758728027344, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8413555026054382, "num_tokens": 268640805.0, "step": 7042 }, { "epoch": 0.8959419921129628, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.61457824707031, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8614277839660645, "num_tokens": 268682446.0, "step": 7043 }, { "epoch": 0.8960692023915532, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.73942565917969, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8651654720306396, "num_tokens": 268720262.0, "step": 7044 }, { "epoch": 0.8961964126701437, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.32186508178711, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8446683883666992, "num_tokens": 268760519.0, "step": 7045 }, { "epoch": 0.8963236229487342, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.70827865600586, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.848608136177063, "num_tokens": 268796621.0, "step": 7046 }, { "epoch": 0.8964508332273248, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.35226058959961, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.863938570022583, "num_tokens": 268830527.0, "step": 7047 }, { "epoch": 0.8965780435059153, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.76266098022461, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8620007634162903, "num_tokens": 268862112.0, "step": 7048 }, { "epoch": 0.8967052537845058, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.36591720581055, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8552632331848145, "num_tokens": 268901089.0, "step": 7049 }, { "epoch": 0.8968324640630962, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.64263916015625, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8591927886009216, "num_tokens": 268942766.0, "step": 7050 }, { "epoch": 0.8969596743416868, "ewc_loss": 0.091796875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.486343383789062e-05, "grad_norm": 32.47728729248047, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8624537587165833, "num_tokens": 268978642.0, "step": 7051 }, { "epoch": 0.8970868846202773, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.2479248046875, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8624587059020996, "num_tokens": 269016048.0, "step": 7052 }, { "epoch": 0.8972140948988678, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.5455436706543, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8576933145523071, "num_tokens": 269054381.0, "step": 7053 }, { "epoch": 0.8973413051774584, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.36934280395508, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8603034019470215, "num_tokens": 269090672.0, "step": 7054 }, { "epoch": 0.8974685154560489, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.51127624511719, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8602433800697327, "num_tokens": 269130231.0, "step": 7055 }, { "epoch": 0.8975957257346393, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.56840515136719, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8606871366500854, "num_tokens": 269166282.0, "step": 7056 }, { "epoch": 0.8977229360132298, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.51736831665039, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8594350814819336, "num_tokens": 269205326.0, "step": 7057 }, { "epoch": 0.8978501462918204, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.69002914428711, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8585354089736938, "num_tokens": 269243903.0, "step": 7058 }, { "epoch": 0.8979773565704109, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.741886138916016, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8509537577629089, "num_tokens": 269288330.0, "step": 7059 }, { "epoch": 0.8981045668490014, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.44878387451172, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8465109467506409, "num_tokens": 269328731.0, "step": 7060 }, { "epoch": 0.898231777127592, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.614078521728516, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8580425977706909, "num_tokens": 269369576.0, "step": 7061 }, { "epoch": 0.8983589874061825, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.49822998046875, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8555967211723328, "num_tokens": 269407734.0, "step": 7062 }, { "epoch": 0.8984861976847729, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.39811325073242, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8348427414894104, "num_tokens": 269444042.0, "step": 7063 }, { "epoch": 0.8986134079633634, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.51082992553711, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8792970776557922, "num_tokens": 269480979.0, "step": 7064 }, { "epoch": 0.898740618241954, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.65908432006836, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8565817475318909, "num_tokens": 269515594.0, "step": 7065 }, { "epoch": 0.8988678285205445, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.88343048095703, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8592232465744019, "num_tokens": 269556974.0, "step": 7066 }, { "epoch": 0.898995038799135, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.44486618041992, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8498852252960205, "num_tokens": 269591088.0, "step": 7067 }, { "epoch": 0.8991222490777255, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.85502243041992, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8655377626419067, "num_tokens": 269627921.0, "step": 7068 }, { "epoch": 0.899249459356316, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.506404876708984, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8403459191322327, "num_tokens": 269666586.0, "step": 7069 }, { "epoch": 0.8993766696349065, "ewc_loss": 0.09375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.54658126831055, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8600108623504639, "num_tokens": 269701205.0, "step": 7070 }, { "epoch": 0.899503879913497, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.71774673461914, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8695511817932129, "num_tokens": 269739982.0, "step": 7071 }, { "epoch": 0.8996310901920875, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.689476013183594, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8593682050704956, "num_tokens": 269773048.0, "step": 7072 }, { "epoch": 0.8997583004706781, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.68980407714844, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8567471504211426, "num_tokens": 269811343.0, "step": 7073 }, { "epoch": 0.8998855107492686, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.91786575317383, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8614861369132996, "num_tokens": 269851601.0, "step": 7074 }, { "epoch": 0.900012721027859, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.44675064086914, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8653781414031982, "num_tokens": 269891396.0, "step": 7075 }, { "epoch": 0.9001399313064495, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.77987289428711, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8560088872909546, "num_tokens": 269930292.0, "step": 7076 }, { "epoch": 0.9002671415850401, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.54377365112305, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8299087285995483, "num_tokens": 269968464.0, "step": 7077 }, { "epoch": 0.9003943518636306, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.057926177978516, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8437627553939819, "num_tokens": 270011569.0, "step": 7078 }, { "epoch": 0.9005215621422211, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.597591400146484, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8522108793258667, "num_tokens": 270050608.0, "step": 7079 }, { "epoch": 0.9006487724208116, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.886348724365234, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.842313289642334, "num_tokens": 270096100.0, "step": 7080 }, { "epoch": 0.9007759826994021, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.15507507324219, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8671255111694336, "num_tokens": 270140915.0, "step": 7081 }, { "epoch": 0.9009031929779926, "ewc_loss": 0.09375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.97761154174805, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8575329780578613, "num_tokens": 270184233.0, "step": 7082 }, { "epoch": 0.9010304032565831, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.16608810424805, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8551781177520752, "num_tokens": 270223889.0, "step": 7083 }, { "epoch": 0.9011576135351737, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.750125885009766, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8614922761917114, "num_tokens": 270266357.0, "step": 7084 }, { "epoch": 0.9012848238137642, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.29642868041992, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8648006916046143, "num_tokens": 270298252.0, "step": 7085 }, { "epoch": 0.9014120340923547, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.647491455078125, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8499399423599243, "num_tokens": 270343187.0, "step": 7086 }, { "epoch": 0.9015392443709451, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.590614318847656, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8464412093162537, "num_tokens": 270385094.0, "step": 7087 }, { "epoch": 0.9016664546495357, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.54412078857422, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8445606231689453, "num_tokens": 270421772.0, "step": 7088 }, { "epoch": 0.9017936649281262, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.72821807861328, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8383651971817017, "num_tokens": 270465033.0, "step": 7089 }, { "epoch": 0.9019208752067167, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.462303161621094, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.87912517786026, "num_tokens": 270501370.0, "step": 7090 }, { "epoch": 0.9020480854853072, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.9044303894043, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8604626655578613, "num_tokens": 270543571.0, "step": 7091 }, { "epoch": 0.9021752957638978, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.51724624633789, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8668269515037537, "num_tokens": 270580635.0, "step": 7092 }, { "epoch": 0.9023025060424882, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.5742301940918, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8702598214149475, "num_tokens": 270616534.0, "step": 7093 }, { "epoch": 0.9024297163210787, "ewc_loss": 0.09375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.9308967590332, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8392603993415833, "num_tokens": 270653998.0, "step": 7094 }, { "epoch": 0.9025569265996692, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.57445526123047, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8558423519134521, "num_tokens": 270688023.0, "step": 7095 }, { "epoch": 0.9026841368782598, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.64240264892578, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8389108180999756, "num_tokens": 270721298.0, "step": 7096 }, { "epoch": 0.9028113471568503, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.14061737060547, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8620376586914062, "num_tokens": 270762182.0, "step": 7097 }, { "epoch": 0.9029385574354408, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.29454040527344, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8722800612449646, "num_tokens": 270799577.0, "step": 7098 }, { "epoch": 0.9030657677140312, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.278778076171875, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8580098152160645, "num_tokens": 270840218.0, "step": 7099 }, { "epoch": 0.9031929779926218, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 32.84682083129883, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8770579099655151, "num_tokens": 270872982.0, "step": 7100 }, { "epoch": 0.9033201882712123, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.60385513305664, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8619894981384277, "num_tokens": 270908381.0, "step": 7101 }, { "epoch": 0.9034473985498028, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.8095817565918, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8572436571121216, "num_tokens": 270949217.0, "step": 7102 }, { "epoch": 0.9035746088283934, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.60606384277344, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8401138782501221, "num_tokens": 270989667.0, "step": 7103 }, { "epoch": 0.9037018191069839, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.71784591674805, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8469836711883545, "num_tokens": 271030395.0, "step": 7104 }, { "epoch": 0.9038290293855743, "ewc_loss": 0.09375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.43122100830078, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8381310701370239, "num_tokens": 271068239.0, "step": 7105 }, { "epoch": 0.9039562396641648, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.93034744262695, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8503530621528625, "num_tokens": 271110294.0, "step": 7106 }, { "epoch": 0.9040834499427554, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.48688507080078, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.876270055770874, "num_tokens": 271143288.0, "step": 7107 }, { "epoch": 0.9042106602213459, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.87245559692383, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8643538355827332, "num_tokens": 271181183.0, "step": 7108 }, { "epoch": 0.9043378704999364, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.62531280517578, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8560292720794678, "num_tokens": 271216201.0, "step": 7109 }, { "epoch": 0.9044650807785269, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.65398025512695, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8642103672027588, "num_tokens": 271250158.0, "step": 7110 }, { "epoch": 0.9045922910571175, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.54638671875, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8691838979721069, "num_tokens": 271289994.0, "step": 7111 }, { "epoch": 0.9047195013357079, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.773651123046875, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8624507188796997, "num_tokens": 271325782.0, "step": 7112 }, { "epoch": 0.9048467116142984, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.61762619018555, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8443930745124817, "num_tokens": 271361604.0, "step": 7113 }, { "epoch": 0.9049739218928889, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.761783599853516, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8666718006134033, "num_tokens": 271398520.0, "step": 7114 }, { "epoch": 0.9051011321714795, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.79059600830078, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8519099950790405, "num_tokens": 271445138.0, "step": 7115 }, { "epoch": 0.90522834245007, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.0237922668457, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8609793186187744, "num_tokens": 271479147.0, "step": 7116 }, { "epoch": 0.9053555527286605, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.770179748535156, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8593199253082275, "num_tokens": 271512477.0, "step": 7117 }, { "epoch": 0.905482763007251, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.8702278137207, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.843352198600769, "num_tokens": 271552355.0, "step": 7118 }, { "epoch": 0.9056099732858415, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.62940216064453, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8534678220748901, "num_tokens": 271595825.0, "step": 7119 }, { "epoch": 0.905737183564432, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.22051239013672, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8456840515136719, "num_tokens": 271627384.0, "step": 7120 }, { "epoch": 0.9058643938430225, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.501075744628906, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8663629293441772, "num_tokens": 271666711.0, "step": 7121 }, { "epoch": 0.9059916041216131, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.323524475097656, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8636969327926636, "num_tokens": 271703145.0, "step": 7122 }, { "epoch": 0.9061188144002036, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.63627243041992, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8462530374526978, "num_tokens": 271743131.0, "step": 7123 }, { "epoch": 0.906246024678794, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.48059844970703, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8615659475326538, "num_tokens": 271778245.0, "step": 7124 }, { "epoch": 0.9063732349573845, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.47255325317383, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8494377136230469, "num_tokens": 271817279.0, "step": 7125 }, { "epoch": 0.9065004452359751, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.922977447509766, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8652685284614563, "num_tokens": 271857136.0, "step": 7126 }, { "epoch": 0.9066276555145656, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.6972541809082, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8652769923210144, "num_tokens": 271891651.0, "step": 7127 }, { "epoch": 0.9067548657931561, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.673912048339844, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8620192408561707, "num_tokens": 271925805.0, "step": 7128 }, { "epoch": 0.9068820760717466, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.97402572631836, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8614866733551025, "num_tokens": 271964954.0, "step": 7129 }, { "epoch": 0.9070092863503371, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.5056266784668, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8570342063903809, "num_tokens": 272000331.0, "step": 7130 }, { "epoch": 0.9071364966289276, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.00410842895508, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8666512966156006, "num_tokens": 272037853.0, "step": 7131 }, { "epoch": 0.9072637069075181, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.57802200317383, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8674135804176331, "num_tokens": 272077663.0, "step": 7132 }, { "epoch": 0.9073909171861086, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.03807830810547, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8518685698509216, "num_tokens": 272116942.0, "step": 7133 }, { "epoch": 0.9075181274646992, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.71965408325195, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8515126705169678, "num_tokens": 272151319.0, "step": 7134 }, { "epoch": 0.9076453377432897, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.97959899902344, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8441735506057739, "num_tokens": 272190817.0, "step": 7135 }, { "epoch": 0.9077725480218801, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.90126037597656, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8509299755096436, "num_tokens": 272230461.0, "step": 7136 }, { "epoch": 0.9078997583004706, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.84711456298828, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8517645001411438, "num_tokens": 272268510.0, "step": 7137 }, { "epoch": 0.9080269685790612, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.87291717529297, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8570778369903564, "num_tokens": 272312430.0, "step": 7138 }, { "epoch": 0.9081541788576517, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.815006256103516, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8553146123886108, "num_tokens": 272348437.0, "step": 7139 }, { "epoch": 0.9082813891362422, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.902889251708984, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8501367568969727, "num_tokens": 272378902.0, "step": 7140 }, { "epoch": 0.9084085994148328, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.63005065917969, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8697333335876465, "num_tokens": 272421257.0, "step": 7141 }, { "epoch": 0.9085358096934232, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.9252815246582, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8560171723365784, "num_tokens": 272459800.0, "step": 7142 }, { "epoch": 0.9086630199720137, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.490352630615234, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.875759482383728, "num_tokens": 272501245.0, "step": 7143 }, { "epoch": 0.9087902302506042, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.3354606628418, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8588769435882568, "num_tokens": 272535873.0, "step": 7144 }, { "epoch": 0.9089174405291948, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.648704528808594, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8544549942016602, "num_tokens": 272575213.0, "step": 7145 }, { "epoch": 0.9090446508077853, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.71111297607422, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8598012924194336, "num_tokens": 272611960.0, "step": 7146 }, { "epoch": 0.9091718610863758, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.179264068603516, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8577796220779419, "num_tokens": 272650895.0, "step": 7147 }, { "epoch": 0.9092990713649662, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.653167724609375, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.857595682144165, "num_tokens": 272690617.0, "step": 7148 }, { "epoch": 0.9094262816435568, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.917240142822266, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8489673137664795, "num_tokens": 272732526.0, "step": 7149 }, { "epoch": 0.9095534919221473, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.79360580444336, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8557308912277222, "num_tokens": 272769882.0, "step": 7150 }, { "epoch": 0.9096807022007378, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.91449737548828, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8602049946784973, "num_tokens": 272809614.0, "step": 7151 }, { "epoch": 0.9098079124793284, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.91050720214844, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8501777648925781, "num_tokens": 272844819.0, "step": 7152 }, { "epoch": 0.9099351227579189, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.821189880371094, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8635714650154114, "num_tokens": 272882798.0, "step": 7153 }, { "epoch": 0.9100623330365093, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.95500183105469, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.859316349029541, "num_tokens": 272917472.0, "step": 7154 }, { "epoch": 0.9101895433150998, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.83237838745117, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8463163375854492, "num_tokens": 272959947.0, "step": 7155 }, { "epoch": 0.9103167535936904, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.835689544677734, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8401762247085571, "num_tokens": 272998194.0, "step": 7156 }, { "epoch": 0.9104439638722809, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.811771392822266, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8746039867401123, "num_tokens": 273034688.0, "step": 7157 }, { "epoch": 0.9105711741508714, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.792415618896484, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8692725300788879, "num_tokens": 273071575.0, "step": 7158 }, { "epoch": 0.9106983844294619, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.846614837646484, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8322853446006775, "num_tokens": 273106106.0, "step": 7159 }, { "epoch": 0.9108255947080524, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.90263748168945, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8523777723312378, "num_tokens": 273142959.0, "step": 7160 }, { "epoch": 0.9109528049866429, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.69533920288086, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8433555364608765, "num_tokens": 273182781.0, "step": 7161 }, { "epoch": 0.9110800152652334, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.8906135559082, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8643325567245483, "num_tokens": 273225738.0, "step": 7162 }, { "epoch": 0.9112072255438239, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.73903274536133, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8652355074882507, "num_tokens": 273269166.0, "step": 7163 }, { "epoch": 0.9113344358224145, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.050838470458984, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8523696064949036, "num_tokens": 273309093.0, "step": 7164 }, { "epoch": 0.911461646101005, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.78575897216797, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8623897433280945, "num_tokens": 273346857.0, "step": 7165 }, { "epoch": 0.9115888563795955, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.702064514160156, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8532490730285645, "num_tokens": 273385720.0, "step": 7166 }, { "epoch": 0.9117160666581859, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.651275634765625, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8577338457107544, "num_tokens": 273431079.0, "step": 7167 }, { "epoch": 0.9118432769367765, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.2564582824707, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8588976860046387, "num_tokens": 273477316.0, "step": 7168 }, { "epoch": 0.911970487215367, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.74254608154297, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8600198030471802, "num_tokens": 273512831.0, "step": 7169 }, { "epoch": 0.9120976974939575, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.1874885559082, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8531399369239807, "num_tokens": 273551927.0, "step": 7170 }, { "epoch": 0.912224907772548, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.10136795043945, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8598440885543823, "num_tokens": 273583053.0, "step": 7171 }, { "epoch": 0.9123521180511386, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.7685661315918, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8474572896957397, "num_tokens": 273617953.0, "step": 7172 }, { "epoch": 0.912479328329729, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.10736083984375, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8423654437065125, "num_tokens": 273649959.0, "step": 7173 }, { "epoch": 0.9126065386083195, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.3753776550293, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8475208282470703, "num_tokens": 273682666.0, "step": 7174 }, { "epoch": 0.9127337488869101, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.74661636352539, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8742769956588745, "num_tokens": 273718641.0, "step": 7175 }, { "epoch": 0.9128609591655006, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.4753532409668, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8679649829864502, "num_tokens": 273760502.0, "step": 7176 }, { "epoch": 0.9129881694440911, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.78797912597656, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8510360717773438, "num_tokens": 273795068.0, "step": 7177 }, { "epoch": 0.9131153797226816, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.6486930847168, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8391125202178955, "num_tokens": 273825099.0, "step": 7178 }, { "epoch": 0.9132425900012721, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.356624603271484, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8638327121734619, "num_tokens": 273862566.0, "step": 7179 }, { "epoch": 0.9133698002798626, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.12507247924805, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8574143648147583, "num_tokens": 273896107.0, "step": 7180 }, { "epoch": 0.9134970105584531, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.256256103515625, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8628520965576172, "num_tokens": 273934066.0, "step": 7181 }, { "epoch": 0.9136242208370436, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.71294021606445, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8564895987510681, "num_tokens": 273971843.0, "step": 7182 }, { "epoch": 0.9137514311156342, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.09785842895508, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8504855036735535, "num_tokens": 274001646.0, "step": 7183 }, { "epoch": 0.9138786413942247, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.99678039550781, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8415684700012207, "num_tokens": 274042656.0, "step": 7184 }, { "epoch": 0.9140058516728151, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.00508499145508, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8439177870750427, "num_tokens": 274084609.0, "step": 7185 }, { "epoch": 0.9141330619514056, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.08979797363281, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8598612546920776, "num_tokens": 274127503.0, "step": 7186 }, { "epoch": 0.9142602722299962, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.64828109741211, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8623462915420532, "num_tokens": 274164126.0, "step": 7187 }, { "epoch": 0.9143874825085867, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.87290573120117, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8572919368743896, "num_tokens": 274198104.0, "step": 7188 }, { "epoch": 0.9145146927871772, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.933719635009766, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8472813367843628, "num_tokens": 274233487.0, "step": 7189 }, { "epoch": 0.9146419030657678, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.74323272705078, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8591326475143433, "num_tokens": 274267317.0, "step": 7190 }, { "epoch": 0.9147691133443582, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.087249755859375, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8503782749176025, "num_tokens": 274304169.0, "step": 7191 }, { "epoch": 0.9148963236229487, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.634151458740234, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8587884902954102, "num_tokens": 274343265.0, "step": 7192 }, { "epoch": 0.9150235339015392, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.09244155883789, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8401023745536804, "num_tokens": 274380756.0, "step": 7193 }, { "epoch": 0.9151507441801298, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 34.0206413269043, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8675055503845215, "num_tokens": 274422035.0, "step": 7194 }, { "epoch": 0.9152779544587203, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 34.037940979003906, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8475464582443237, "num_tokens": 274460875.0, "step": 7195 }, { "epoch": 0.9154051647373108, "ewc_loss": 0.09228515625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.534027099609375e-05, "grad_norm": 32.96595001220703, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.858849287033081, "num_tokens": 274495807.0, "step": 7196 }, { "epoch": 0.9155323750159012, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.443199157714844, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8656548261642456, "num_tokens": 274533098.0, "step": 7197 }, { "epoch": 0.9156595852944918, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.934112548828125, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.851867139339447, "num_tokens": 274565282.0, "step": 7198 }, { "epoch": 0.9157867955730823, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.22160720825195, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8606647253036499, "num_tokens": 274605401.0, "step": 7199 }, { "epoch": 0.9159140058516728, "ewc_loss": 0.09375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.398048400878906, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8508996367454529, "num_tokens": 274645018.0, "step": 7200 }, { "epoch": 0.9160412161302633, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.06053924560547, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8669174909591675, "num_tokens": 274681541.0, "step": 7201 }, { "epoch": 0.9161684264088539, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.69231414794922, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.860163152217865, "num_tokens": 274716883.0, "step": 7202 }, { "epoch": 0.9162956366874443, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.811344146728516, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8508981466293335, "num_tokens": 274753038.0, "step": 7203 }, { "epoch": 0.9164228469660348, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.63969039916992, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8629241585731506, "num_tokens": 274790760.0, "step": 7204 }, { "epoch": 0.9165500572446253, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.26125717163086, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.849550724029541, "num_tokens": 274830872.0, "step": 7205 }, { "epoch": 0.9166772675232159, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.60767364501953, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8434363603591919, "num_tokens": 274869832.0, "step": 7206 }, { "epoch": 0.9168044778018064, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.25072479248047, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.849179744720459, "num_tokens": 274903853.0, "step": 7207 }, { "epoch": 0.9169316880803969, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.897491455078125, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.855513334274292, "num_tokens": 274942911.0, "step": 7208 }, { "epoch": 0.9170588983589874, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.03166198730469, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.864275336265564, "num_tokens": 274986427.0, "step": 7209 }, { "epoch": 0.9171861086375779, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.981361389160156, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8587490916252136, "num_tokens": 275021557.0, "step": 7210 }, { "epoch": 0.9173133189161684, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.922542572021484, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8451045751571655, "num_tokens": 275061308.0, "step": 7211 }, { "epoch": 0.9174405291947589, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.25508499145508, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8508350849151611, "num_tokens": 275101497.0, "step": 7212 }, { "epoch": 0.9175677394733495, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.897891998291016, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8440797924995422, "num_tokens": 275138317.0, "step": 7213 }, { "epoch": 0.91769494975194, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.18639373779297, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8563503623008728, "num_tokens": 275180119.0, "step": 7214 }, { "epoch": 0.9178221600305305, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.7865104675293, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8551636934280396, "num_tokens": 275212083.0, "step": 7215 }, { "epoch": 0.9179493703091209, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.93616485595703, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8447028398513794, "num_tokens": 275247872.0, "step": 7216 }, { "epoch": 0.9180765805877115, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.02445983886719, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8611733913421631, "num_tokens": 275287022.0, "step": 7217 }, { "epoch": 0.918203790866302, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.96903610229492, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8578059077262878, "num_tokens": 275322259.0, "step": 7218 }, { "epoch": 0.9183310011448925, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.966129302978516, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8704617023468018, "num_tokens": 275357410.0, "step": 7219 }, { "epoch": 0.918458211423483, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.110435485839844, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8689827919006348, "num_tokens": 275388720.0, "step": 7220 }, { "epoch": 0.9185854217020736, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.33190155029297, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8630746603012085, "num_tokens": 275427699.0, "step": 7221 }, { "epoch": 0.918712631980664, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.26969909667969, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8598837852478027, "num_tokens": 275472220.0, "step": 7222 }, { "epoch": 0.9188398422592545, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.202091217041016, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8452337980270386, "num_tokens": 275506693.0, "step": 7223 }, { "epoch": 0.918967052537845, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.1022834777832, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8590468168258667, "num_tokens": 275547295.0, "step": 7224 }, { "epoch": 0.9190942628164356, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.88915252685547, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8525959253311157, "num_tokens": 275591552.0, "step": 7225 }, { "epoch": 0.9192214730950261, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.3636474609375, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8322896361351013, "num_tokens": 275632197.0, "step": 7226 }, { "epoch": 0.9193486833736166, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.95968246459961, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8492192029953003, "num_tokens": 275670680.0, "step": 7227 }, { "epoch": 0.919475893652207, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.155120849609375, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8543487787246704, "num_tokens": 275709315.0, "step": 7228 }, { "epoch": 0.9196031039307976, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.62633514404297, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8550254106521606, "num_tokens": 275740405.0, "step": 7229 }, { "epoch": 0.9197303142093881, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.33856201171875, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.874672532081604, "num_tokens": 275780915.0, "step": 7230 }, { "epoch": 0.9198575244879786, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 32.93821334838867, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8504005670547485, "num_tokens": 275817832.0, "step": 7231 }, { "epoch": 0.9199847347665692, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.25499725341797, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8602691888809204, "num_tokens": 275857621.0, "step": 7232 }, { "epoch": 0.9201119450451597, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.839012145996094, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8538373708724976, "num_tokens": 275898917.0, "step": 7233 }, { "epoch": 0.9202391553237501, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.00080490112305, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8601521849632263, "num_tokens": 275930151.0, "step": 7234 }, { "epoch": 0.9203663656023406, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.201011657714844, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8637551665306091, "num_tokens": 275966901.0, "step": 7235 }, { "epoch": 0.9204935758809312, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.969512939453125, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8789293766021729, "num_tokens": 276002845.0, "step": 7236 }, { "epoch": 0.9206207861595217, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.188209533691406, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8546555638313293, "num_tokens": 276038476.0, "step": 7237 }, { "epoch": 0.9207479964381122, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.98479461669922, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.852332592010498, "num_tokens": 276077867.0, "step": 7238 }, { "epoch": 0.9208752067167028, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.312034606933594, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8459385633468628, "num_tokens": 276123289.0, "step": 7239 }, { "epoch": 0.9210024169952932, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.78483200073242, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8426162004470825, "num_tokens": 276157357.0, "step": 7240 }, { "epoch": 0.9211296272738837, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.85310363769531, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8771304488182068, "num_tokens": 276195548.0, "step": 7241 }, { "epoch": 0.9212568375524742, "ewc_loss": 0.0927734375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.581710815429688e-05, "grad_norm": 33.01203155517578, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8585706949234009, "num_tokens": 276233533.0, "step": 7242 }, { "epoch": 0.9213840478310648, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.603450775146484, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8589096069335938, "num_tokens": 276274310.0, "step": 7243 }, { "epoch": 0.9215112581096553, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.15415954589844, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8547326326370239, "num_tokens": 276310272.0, "step": 7244 }, { "epoch": 0.9216384683882458, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.07275390625, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8513495922088623, "num_tokens": 276350334.0, "step": 7245 }, { "epoch": 0.9217656786668362, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.47214889526367, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.851665735244751, "num_tokens": 276393913.0, "step": 7246 }, { "epoch": 0.9218928889454268, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.23722839355469, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8531651496887207, "num_tokens": 276430368.0, "step": 7247 }, { "epoch": 0.9220200992240173, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.12647247314453, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8505983352661133, "num_tokens": 276470930.0, "step": 7248 }, { "epoch": 0.9221473095026078, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 32.91719436645508, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8604938983917236, "num_tokens": 276509359.0, "step": 7249 }, { "epoch": 0.9222745197811983, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.20891189575195, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8471819162368774, "num_tokens": 276546980.0, "step": 7250 }, { "epoch": 0.9224017300597889, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.15895462036133, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8442712426185608, "num_tokens": 276586212.0, "step": 7251 }, { "epoch": 0.9225289403383793, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.171573638916016, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8585675954818726, "num_tokens": 276623946.0, "step": 7252 }, { "epoch": 0.9226561506169698, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.85985565185547, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8508211374282837, "num_tokens": 276667567.0, "step": 7253 }, { "epoch": 0.9227833608955603, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.29924774169922, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8498351573944092, "num_tokens": 276708479.0, "step": 7254 }, { "epoch": 0.9229105711741509, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.3345947265625, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8512575626373291, "num_tokens": 276744627.0, "step": 7255 }, { "epoch": 0.9230377814527414, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.167232513427734, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.856248140335083, "num_tokens": 276782830.0, "step": 7256 }, { "epoch": 0.9231649917313319, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.04694366455078, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.8543135523796082, "num_tokens": 276825745.0, "step": 7257 }, { "epoch": 0.9232922020099223, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.0937614440918, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8622139096260071, "num_tokens": 276866743.0, "step": 7258 }, { "epoch": 0.9234194122885129, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.348079681396484, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8509466648101807, "num_tokens": 276903884.0, "step": 7259 }, { "epoch": 0.9235466225671034, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.177032470703125, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8613046407699585, "num_tokens": 276939142.0, "step": 7260 }, { "epoch": 0.9236738328456939, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.11643600463867, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.856989860534668, "num_tokens": 276979041.0, "step": 7261 }, { "epoch": 0.9238010431242845, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.32014465332031, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8691018223762512, "num_tokens": 277015590.0, "step": 7262 }, { "epoch": 0.923928253402875, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.96623992919922, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8481504917144775, "num_tokens": 277055745.0, "step": 7263 }, { "epoch": 0.9240554636814655, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.15598678588867, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8507533669471741, "num_tokens": 277098856.0, "step": 7264 }, { "epoch": 0.9241826739600559, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.26797866821289, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8672207593917847, "num_tokens": 277139165.0, "step": 7265 }, { "epoch": 0.9243098842386465, "ewc_loss": 0.09375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 32.96556091308594, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8605988025665283, "num_tokens": 277171879.0, "step": 7266 }, { "epoch": 0.924437094517237, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.20482635498047, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8576065897941589, "num_tokens": 277211719.0, "step": 7267 }, { "epoch": 0.9245643047958275, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.23428726196289, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8475750088691711, "num_tokens": 277252675.0, "step": 7268 }, { "epoch": 0.924691515074418, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.90896224975586, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8447601795196533, "num_tokens": 277287529.0, "step": 7269 }, { "epoch": 0.9248187253530086, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.34043884277344, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8558733463287354, "num_tokens": 277328392.0, "step": 7270 }, { "epoch": 0.924945935631599, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.97771072387695, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8664031028747559, "num_tokens": 277368412.0, "step": 7271 }, { "epoch": 0.9250731459101895, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.13262939453125, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.862350583076477, "num_tokens": 277404373.0, "step": 7272 }, { "epoch": 0.92520035618878, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.062042236328125, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8531713485717773, "num_tokens": 277444988.0, "step": 7273 }, { "epoch": 0.9253275664673706, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.408782958984375, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8653520345687866, "num_tokens": 277487501.0, "step": 7274 }, { "epoch": 0.9254547767459611, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.15379333496094, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8520646095275879, "num_tokens": 277529056.0, "step": 7275 }, { "epoch": 0.9255819870245516, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 32.86039352416992, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8669437170028687, "num_tokens": 277563022.0, "step": 7276 }, { "epoch": 0.925709197303142, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.23475646972656, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8636019229888916, "num_tokens": 277603879.0, "step": 7277 }, { "epoch": 0.9258364075817326, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.007301330566406, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8519953489303589, "num_tokens": 277643711.0, "step": 7278 }, { "epoch": 0.9259636178603231, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.550819396972656, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8562412261962891, "num_tokens": 277680667.0, "step": 7279 }, { "epoch": 0.9260908281389136, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.01325225830078, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8539997935295105, "num_tokens": 277715548.0, "step": 7280 }, { "epoch": 0.9262180384175042, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.20063781738281, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8559756278991699, "num_tokens": 277745287.0, "step": 7281 }, { "epoch": 0.9263452486960947, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.987876892089844, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8503394722938538, "num_tokens": 277781484.0, "step": 7282 }, { "epoch": 0.9264724589746851, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.265804290771484, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.858834445476532, "num_tokens": 277818429.0, "step": 7283 }, { "epoch": 0.9265996692532756, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 32.944950103759766, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8646895885467529, "num_tokens": 277853077.0, "step": 7284 }, { "epoch": 0.9267268795318662, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.23228454589844, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8751919865608215, "num_tokens": 277893997.0, "step": 7285 }, { "epoch": 0.9268540898104567, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.01891326904297, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8457956314086914, "num_tokens": 277934680.0, "step": 7286 }, { "epoch": 0.9269813000890472, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.45329284667969, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8488630652427673, "num_tokens": 277980370.0, "step": 7287 }, { "epoch": 0.9271085103676378, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.09120178222656, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8684658408164978, "num_tokens": 278017257.0, "step": 7288 }, { "epoch": 0.9272357206462282, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.78053283691406, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8560497760772705, "num_tokens": 278054835.0, "step": 7289 }, { "epoch": 0.9273629309248187, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.96638488769531, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8544260263442993, "num_tokens": 278089510.0, "step": 7290 }, { "epoch": 0.9274901412034092, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.387840270996094, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8582958579063416, "num_tokens": 278126179.0, "step": 7291 }, { "epoch": 0.9276173514819998, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.03794860839844, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8337186574935913, "num_tokens": 278170837.0, "step": 7292 }, { "epoch": 0.9277445617605903, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.09247589111328, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8609779477119446, "num_tokens": 278207410.0, "step": 7293 }, { "epoch": 0.9278717720391808, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.860565185546875, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8763916492462158, "num_tokens": 278240495.0, "step": 7294 }, { "epoch": 0.9279989823177712, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.40093994140625, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8481444716453552, "num_tokens": 278278449.0, "step": 7295 }, { "epoch": 0.9281261925963618, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.93693161010742, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.866037130355835, "num_tokens": 278321434.0, "step": 7296 }, { "epoch": 0.9282534028749523, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 32.896324157714844, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8510288000106812, "num_tokens": 278363730.0, "step": 7297 }, { "epoch": 0.9283806131535428, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.51789093017578, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8573128581047058, "num_tokens": 278395493.0, "step": 7298 }, { "epoch": 0.9285078234321333, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.2037353515625, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8594239950180054, "num_tokens": 278433024.0, "step": 7299 }, { "epoch": 0.9286350337107239, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 32.98420333862305, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8568000793457031, "num_tokens": 278471311.0, "step": 7300 }, { "epoch": 0.9287622439893143, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.624237060546875, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8764027953147888, "num_tokens": 278509151.0, "step": 7301 }, { "epoch": 0.9288894542679048, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 32.763980865478516, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8635362386703491, "num_tokens": 278548743.0, "step": 7302 }, { "epoch": 0.9290166645464953, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.67576217651367, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8680797815322876, "num_tokens": 278590722.0, "step": 7303 }, { "epoch": 0.9291438748250859, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.963836669921875, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8436710834503174, "num_tokens": 278628150.0, "step": 7304 }, { "epoch": 0.9292710851036764, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.61296081542969, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.84214186668396, "num_tokens": 278672543.0, "step": 7305 }, { "epoch": 0.9293982953822669, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.41100311279297, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8651058673858643, "num_tokens": 278707234.0, "step": 7306 }, { "epoch": 0.9295255056608573, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.572078704833984, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.857552170753479, "num_tokens": 278734678.0, "step": 7307 }, { "epoch": 0.9296527159394479, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.48076248168945, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8658394813537598, "num_tokens": 278778284.0, "step": 7308 }, { "epoch": 0.9297799262180384, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.454280853271484, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8556019067764282, "num_tokens": 278815363.0, "step": 7309 }, { "epoch": 0.9299071364966289, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.69313049316406, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8439041376113892, "num_tokens": 278860777.0, "step": 7310 }, { "epoch": 0.9300343467752195, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.31573486328125, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8714984655380249, "num_tokens": 278890552.0, "step": 7311 }, { "epoch": 0.93016155705381, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.61509704589844, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8434814810752869, "num_tokens": 278924741.0, "step": 7312 }, { "epoch": 0.9302887673324005, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.393402099609375, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8658996820449829, "num_tokens": 278965185.0, "step": 7313 }, { "epoch": 0.9304159776109909, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.805450439453125, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8470454216003418, "num_tokens": 278998961.0, "step": 7314 }, { "epoch": 0.9305431878895815, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.24243927001953, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8450742959976196, "num_tokens": 279032689.0, "step": 7315 }, { "epoch": 0.930670398168172, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.564517974853516, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8443183302879333, "num_tokens": 279080305.0, "step": 7316 }, { "epoch": 0.9307976084467625, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.54151916503906, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8566368818283081, "num_tokens": 279120581.0, "step": 7317 }, { "epoch": 0.930924818725353, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.36427688598633, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8693578243255615, "num_tokens": 279165050.0, "step": 7318 }, { "epoch": 0.9310520290039436, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.472774505615234, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8617566823959351, "num_tokens": 279209393.0, "step": 7319 }, { "epoch": 0.931179239282534, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.55130386352539, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8431029319763184, "num_tokens": 279250569.0, "step": 7320 }, { "epoch": 0.9313064495611245, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.38946533203125, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8625176548957825, "num_tokens": 279291154.0, "step": 7321 }, { "epoch": 0.931433659839715, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.82624435424805, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8565991520881653, "num_tokens": 279326967.0, "step": 7322 }, { "epoch": 0.9315608701183056, "ewc_loss": 0.09326171875, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.62939453125e-05, "grad_norm": 33.37572479248047, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8697276711463928, "num_tokens": 279367129.0, "step": 7323 }, { "epoch": 0.9316880803968961, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.518802642822266, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8627313375473022, "num_tokens": 279403651.0, "step": 7324 }, { "epoch": 0.9318152906754866, "ewc_loss": 0.09375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.94887924194336, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8488618731498718, "num_tokens": 279439166.0, "step": 7325 }, { "epoch": 0.931942500954077, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.20394515991211, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8540352582931519, "num_tokens": 279473489.0, "step": 7326 }, { "epoch": 0.9320697112326676, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 34.254966735839844, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8609979152679443, "num_tokens": 279511709.0, "step": 7327 }, { "epoch": 0.9321969215112581, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.231468200683594, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8458879590034485, "num_tokens": 279551638.0, "step": 7328 }, { "epoch": 0.9323241317898486, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 34.042484283447266, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8669177293777466, "num_tokens": 279587304.0, "step": 7329 }, { "epoch": 0.9324513420684392, "ewc_loss": 0.09375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.54325866699219, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8483850955963135, "num_tokens": 279628138.0, "step": 7330 }, { "epoch": 0.9325785523470297, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.880558013916016, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.860183596611023, "num_tokens": 279663064.0, "step": 7331 }, { "epoch": 0.9327057626256201, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.06664276123047, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8482494354248047, "num_tokens": 279704937.0, "step": 7332 }, { "epoch": 0.9328329729042106, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.858280181884766, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8602995872497559, "num_tokens": 279738215.0, "step": 7333 }, { "epoch": 0.9329601831828012, "ewc_loss": 0.09423828125, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.369956970214844, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8396259546279907, "num_tokens": 279771996.0, "step": 7334 }, { "epoch": 0.9330873934613917, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.55707550048828, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.871685266494751, "num_tokens": 279810037.0, "step": 7335 }, { "epoch": 0.9332146037399822, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.73872756958008, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.84977126121521, "num_tokens": 279847394.0, "step": 7336 }, { "epoch": 0.9333418140185727, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.165687561035156, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8424415588378906, "num_tokens": 279879057.0, "step": 7337 }, { "epoch": 0.9334690242971632, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 34.066009521484375, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8492393493652344, "num_tokens": 279911775.0, "step": 7338 }, { "epoch": 0.9335962345757537, "ewc_loss": 0.09375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.386085510253906, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8588696122169495, "num_tokens": 279947661.0, "step": 7339 }, { "epoch": 0.9337234448543442, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.47593307495117, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8586444854736328, "num_tokens": 279977507.0, "step": 7340 }, { "epoch": 0.9338506551329347, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.70243453979492, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8525961637496948, "num_tokens": 280016749.0, "step": 7341 }, { "epoch": 0.9339778654115253, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.43269348144531, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8592713475227356, "num_tokens": 280056092.0, "step": 7342 }, { "epoch": 0.9341050756901158, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.58137130737305, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8695805072784424, "num_tokens": 280089803.0, "step": 7343 }, { "epoch": 0.9342322859687062, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.49474334716797, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8449567556381226, "num_tokens": 280126190.0, "step": 7344 }, { "epoch": 0.9343594962472968, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.70296859741211, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8687281012535095, "num_tokens": 280162743.0, "step": 7345 }, { "epoch": 0.9344867065258873, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.59486770629883, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8566518425941467, "num_tokens": 280203676.0, "step": 7346 }, { "epoch": 0.9346139168044778, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.391056060791016, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.874303936958313, "num_tokens": 280241765.0, "step": 7347 }, { "epoch": 0.9347411270830683, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.55789566040039, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8364579677581787, "num_tokens": 280280103.0, "step": 7348 }, { "epoch": 0.9348683373616589, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.454673767089844, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8394574522972107, "num_tokens": 280314674.0, "step": 7349 }, { "epoch": 0.9349955476402493, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.480369567871094, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8588672876358032, "num_tokens": 280353890.0, "step": 7350 }, { "epoch": 0.9351227579188398, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.53690719604492, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8492487668991089, "num_tokens": 280396403.0, "step": 7351 }, { "epoch": 0.9352499681974303, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.036705017089844, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.856899619102478, "num_tokens": 280436154.0, "step": 7352 }, { "epoch": 0.9353771784760209, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.57593536376953, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8643292188644409, "num_tokens": 280476480.0, "step": 7353 }, { "epoch": 0.9355043887546114, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.51129150390625, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8609085083007812, "num_tokens": 280505976.0, "step": 7354 }, { "epoch": 0.9356315990332019, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.55804443359375, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8525723814964294, "num_tokens": 280548365.0, "step": 7355 }, { "epoch": 0.9357588093117923, "ewc_loss": 0.09375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.16799545288086, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8751012682914734, "num_tokens": 280588074.0, "step": 7356 }, { "epoch": 0.9358860195903829, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 34.0408821105957, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8671271800994873, "num_tokens": 280626915.0, "step": 7357 }, { "epoch": 0.9360132298689734, "ewc_loss": 0.09375, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.677078247070312e-05, "grad_norm": 33.21477127075195, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8513517379760742, "num_tokens": 280660088.0, "step": 7358 }, { "epoch": 0.9361404401475639, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.763641357421875, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.855853259563446, "num_tokens": 280701326.0, "step": 7359 }, { "epoch": 0.9362676504261545, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.724761962890625e-05, "grad_norm": 33.051456451416016, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8675299286842346, "num_tokens": 280740256.0, "step": 7360 }, { "epoch": 0.936394860704745, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.54767608642578, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8698522448539734, "num_tokens": 280773515.0, "step": 7361 }, { "epoch": 0.9365220709833355, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.44306182861328, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8330618143081665, "num_tokens": 280817207.0, "step": 7362 }, { "epoch": 0.9366492812619259, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.2042236328125, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8446446061134338, "num_tokens": 280860253.0, "step": 7363 }, { "epoch": 0.9367764915405165, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.765625, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8521938323974609, "num_tokens": 280892634.0, "step": 7364 }, { "epoch": 0.936903701819107, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 32.71809768676758, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8653664588928223, "num_tokens": 280930912.0, "step": 7365 }, { "epoch": 0.9370309120976975, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.922088623046875, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8604954481124878, "num_tokens": 280971588.0, "step": 7366 }, { "epoch": 0.937158122376288, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 32.946083068847656, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8663502931594849, "num_tokens": 281009898.0, "step": 7367 }, { "epoch": 0.9372853326548786, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.88858413696289, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8493006825447083, "num_tokens": 281046726.0, "step": 7368 }, { "epoch": 0.937412542933469, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.42417526245117, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8702467083930969, "num_tokens": 281085163.0, "step": 7369 }, { "epoch": 0.9375397532120595, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.30305480957031, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8643271923065186, "num_tokens": 281128660.0, "step": 7370 }, { "epoch": 0.93766696349065, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.68890380859375, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8381221294403076, "num_tokens": 281163780.0, "step": 7371 }, { "epoch": 0.9377941737692406, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.164268493652344, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8647337555885315, "num_tokens": 281204247.0, "step": 7372 }, { "epoch": 0.9379213840478311, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.851051330566406, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.881411075592041, "num_tokens": 281239518.0, "step": 7373 }, { "epoch": 0.9380485943264216, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.177730560302734, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8652806878089905, "num_tokens": 281275916.0, "step": 7374 }, { "epoch": 0.938175804605012, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.73097610473633, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8665544986724854, "num_tokens": 281307615.0, "step": 7375 }, { "epoch": 0.9383030148836026, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.54416275024414, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.862362802028656, "num_tokens": 281342677.0, "step": 7376 }, { "epoch": 0.9384302251621931, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.64680099487305, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8535579442977905, "num_tokens": 281379545.0, "step": 7377 }, { "epoch": 0.9385574354407836, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.1750373840332, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.839271068572998, "num_tokens": 281424169.0, "step": 7378 }, { "epoch": 0.9386846457193742, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.555904388427734, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8770434856414795, "num_tokens": 281461306.0, "step": 7379 }, { "epoch": 0.9388118559979647, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.49586868286133, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.870337724685669, "num_tokens": 281498753.0, "step": 7380 }, { "epoch": 0.9389390662765551, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.38352966308594, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8665711283683777, "num_tokens": 281537163.0, "step": 7381 }, { "epoch": 0.9390662765551456, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.250732421875, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8484574556350708, "num_tokens": 281577926.0, "step": 7382 }, { "epoch": 0.9391934868337362, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.58037185668945, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8768713474273682, "num_tokens": 281621283.0, "step": 7383 }, { "epoch": 0.9393206971123267, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.576934814453125, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8604419231414795, "num_tokens": 281658336.0, "step": 7384 }, { "epoch": 0.9394479073909172, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.45553207397461, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8607857823371887, "num_tokens": 281695137.0, "step": 7385 }, { "epoch": 0.9395751176695077, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.889225006103516, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8725461363792419, "num_tokens": 281733570.0, "step": 7386 }, { "epoch": 0.9397023279480982, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 32.91741943359375, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8601244688034058, "num_tokens": 281771163.0, "step": 7387 }, { "epoch": 0.9398295382266887, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 34.09914016723633, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8608169555664062, "num_tokens": 281813715.0, "step": 7388 }, { "epoch": 0.9399567485052792, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.22699737548828, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8554275631904602, "num_tokens": 281856921.0, "step": 7389 }, { "epoch": 0.9400839587838697, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.81809616088867, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8497262597084045, "num_tokens": 281893273.0, "step": 7390 }, { "epoch": 0.9402111690624603, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.02113723754883, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8630616664886475, "num_tokens": 281927452.0, "step": 7391 }, { "epoch": 0.9403383793410508, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.14152908325195, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8831465244293213, "num_tokens": 281965359.0, "step": 7392 }, { "epoch": 0.9404655896196412, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.05632019042969, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8541116714477539, "num_tokens": 281999209.0, "step": 7393 }, { "epoch": 0.9405927998982317, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.01845169067383, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8471043109893799, "num_tokens": 282042129.0, "step": 7394 }, { "epoch": 0.9407200101768223, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.590415954589844, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8492408990859985, "num_tokens": 282087941.0, "step": 7395 }, { "epoch": 0.9408472204554128, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.63584518432617, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.863644003868103, "num_tokens": 282123130.0, "step": 7396 }, { "epoch": 0.9409744307340033, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.186431884765625, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8528931140899658, "num_tokens": 282161142.0, "step": 7397 }, { "epoch": 0.9411016410125939, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.987030029296875, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8813768029212952, "num_tokens": 282196363.0, "step": 7398 }, { "epoch": 0.9412288512911843, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.47844314575195, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8382996916770935, "num_tokens": 282233202.0, "step": 7399 }, { "epoch": 0.9413560615697748, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.640869140625, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8581815361976624, "num_tokens": 282270976.0, "step": 7400 }, { "epoch": 0.9414832718483653, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.48792266845703, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8667945265769958, "num_tokens": 282312223.0, "step": 7401 }, { "epoch": 0.9416104821269559, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.71549987792969, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8408304452896118, "num_tokens": 282350418.0, "step": 7402 }, { "epoch": 0.9417376924055464, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.32293701171875, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8575373888015747, "num_tokens": 282380751.0, "step": 7403 }, { "epoch": 0.9418649026841369, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 34.0135498046875, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.868852972984314, "num_tokens": 282420248.0, "step": 7404 }, { "epoch": 0.9419921129627273, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.36184310913086, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.86280357837677, "num_tokens": 282457951.0, "step": 7405 }, { "epoch": 0.9421193232413179, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 34.07879638671875, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.857162594795227, "num_tokens": 282507379.0, "step": 7406 }, { "epoch": 0.9422465335199084, "ewc_loss": 0.0947265625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.518798828125, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8464120030403137, "num_tokens": 282546905.0, "step": 7407 }, { "epoch": 0.9423737437984989, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.929988861083984, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8291020393371582, "num_tokens": 282591368.0, "step": 7408 }, { "epoch": 0.9425009540770894, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.51506423950195, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8553446531295776, "num_tokens": 282617331.0, "step": 7409 }, { "epoch": 0.94262816435568, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 34.02751541137695, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8707358837127686, "num_tokens": 282652346.0, "step": 7410 }, { "epoch": 0.9427553746342705, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.41048812866211, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8617327809333801, "num_tokens": 282691564.0, "step": 7411 }, { "epoch": 0.9428825849128609, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.890869140625, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.867425799369812, "num_tokens": 282728186.0, "step": 7412 }, { "epoch": 0.9430097951914514, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.5200080871582, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8506987690925598, "num_tokens": 282770491.0, "step": 7413 }, { "epoch": 0.943137005470042, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.617340087890625, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8503504991531372, "num_tokens": 282809824.0, "step": 7414 }, { "epoch": 0.9432642157486325, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.695472717285156, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.864002525806427, "num_tokens": 282846735.0, "step": 7415 }, { "epoch": 0.943391426027223, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.4766731262207, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8776852488517761, "num_tokens": 282882242.0, "step": 7416 }, { "epoch": 0.9435186363058136, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.577301025390625, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8450796604156494, "num_tokens": 282918774.0, "step": 7417 }, { "epoch": 0.943645846584404, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.566139221191406, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8494307398796082, "num_tokens": 282957766.0, "step": 7418 }, { "epoch": 0.9437730568629945, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.5587043762207, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8451738357543945, "num_tokens": 282995980.0, "step": 7419 }, { "epoch": 0.943900267141585, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.3211555480957, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8585678935050964, "num_tokens": 283033148.0, "step": 7420 }, { "epoch": 0.9440274774201756, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.376319885253906, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8622213006019592, "num_tokens": 283069781.0, "step": 7421 }, { "epoch": 0.9441546876987661, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.516639709472656, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8493503928184509, "num_tokens": 283107270.0, "step": 7422 }, { "epoch": 0.9442818979773566, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.2274055480957, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.870166540145874, "num_tokens": 283143571.0, "step": 7423 }, { "epoch": 0.944409108255947, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.30550765991211, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8616691827774048, "num_tokens": 283179995.0, "step": 7424 }, { "epoch": 0.9445363185345376, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.462249755859375, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8578994870185852, "num_tokens": 283218370.0, "step": 7425 }, { "epoch": 0.9446635288131281, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.56737518310547, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8511388301849365, "num_tokens": 283247879.0, "step": 7426 }, { "epoch": 0.9447907390917186, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.5178337097168, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8728824257850647, "num_tokens": 283285027.0, "step": 7427 }, { "epoch": 0.9449179493703092, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.218692779541016, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8612399101257324, "num_tokens": 283324934.0, "step": 7428 }, { "epoch": 0.9450451596488997, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.47732925415039, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8649313449859619, "num_tokens": 283357540.0, "step": 7429 }, { "epoch": 0.9451723699274901, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.568260192871094, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8559018969535828, "num_tokens": 283393470.0, "step": 7430 }, { "epoch": 0.9452995802060806, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.3076286315918, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8589324951171875, "num_tokens": 283429916.0, "step": 7431 }, { "epoch": 0.9454267904846712, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.70011520385742, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8456029295921326, "num_tokens": 283467997.0, "step": 7432 }, { "epoch": 0.9455540007632617, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.48386001586914, "learning_rate": 1e-06, "loss": 0.626, "mean_token_accuracy": 0.8227742910385132, "num_tokens": 283504741.0, "step": 7433 }, { "epoch": 0.9456812110418522, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.910240173339844, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8676774501800537, "num_tokens": 283534535.0, "step": 7434 }, { "epoch": 0.9458084213204427, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.19194030761719, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8700612783432007, "num_tokens": 283571600.0, "step": 7435 }, { "epoch": 0.9459356315990332, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.75998306274414, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8450464606285095, "num_tokens": 283605662.0, "step": 7436 }, { "epoch": 0.9460628418776237, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.28007125854492, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.85838782787323, "num_tokens": 283642868.0, "step": 7437 }, { "epoch": 0.9461900521562142, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.242897033691406, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.848310649394989, "num_tokens": 283675604.0, "step": 7438 }, { "epoch": 0.9463172624348047, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.35648727416992, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8793535232543945, "num_tokens": 283709459.0, "step": 7439 }, { "epoch": 0.9464444727133953, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.32408905029297, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8576820492744446, "num_tokens": 283748020.0, "step": 7440 }, { "epoch": 0.9465716829919858, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.47426223754883, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.852167010307312, "num_tokens": 283778514.0, "step": 7441 }, { "epoch": 0.9466988932705762, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.58697509765625, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8611071705818176, "num_tokens": 283819764.0, "step": 7442 }, { "epoch": 0.9468261035491667, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.3682975769043, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8423263430595398, "num_tokens": 283853897.0, "step": 7443 }, { "epoch": 0.9469533138277573, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.729820251464844, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8711897134780884, "num_tokens": 283891826.0, "step": 7444 }, { "epoch": 0.9470805241063478, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.65703201293945, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8573216199874878, "num_tokens": 283931690.0, "step": 7445 }, { "epoch": 0.9472077343849383, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.61835479736328, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8574973344802856, "num_tokens": 283971107.0, "step": 7446 }, { "epoch": 0.9473349446635289, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.53485107421875, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8641791939735413, "num_tokens": 284005286.0, "step": 7447 }, { "epoch": 0.9474621549421193, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.7408447265625, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8721199035644531, "num_tokens": 284044089.0, "step": 7448 }, { "epoch": 0.9475893652207098, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.66472244262695, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8521238565444946, "num_tokens": 284081172.0, "step": 7449 }, { "epoch": 0.9477165754993003, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.8655891418457, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8652961254119873, "num_tokens": 284113362.0, "step": 7450 }, { "epoch": 0.9478437857778909, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.61140060424805, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8683981895446777, "num_tokens": 284155272.0, "step": 7451 }, { "epoch": 0.9479709960564814, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 34.11784362792969, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.865870475769043, "num_tokens": 284188218.0, "step": 7452 }, { "epoch": 0.9480982063350719, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.083274841308594, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8613991141319275, "num_tokens": 284228838.0, "step": 7453 }, { "epoch": 0.9482254166136623, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 34.22160720825195, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8550257086753845, "num_tokens": 284265570.0, "step": 7454 }, { "epoch": 0.9483526268922529, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.65803146362305, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8480403423309326, "num_tokens": 284307062.0, "step": 7455 }, { "epoch": 0.9484798371708434, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.83157730102539, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8556939363479614, "num_tokens": 284340488.0, "step": 7456 }, { "epoch": 0.9486070474494339, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.39316940307617, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8567175269126892, "num_tokens": 284382399.0, "step": 7457 }, { "epoch": 0.9487342577280244, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.45259475708008, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8446328043937683, "num_tokens": 284416391.0, "step": 7458 }, { "epoch": 0.948861468006615, "ewc_loss": 0.09521484375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.772445678710938e-05, "grad_norm": 33.42490768432617, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8584851026535034, "num_tokens": 284457049.0, "step": 7459 }, { "epoch": 0.9489886782852054, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.55493927001953, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8574582934379578, "num_tokens": 284494426.0, "step": 7460 }, { "epoch": 0.9491158885637959, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.572017669677734, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8761767148971558, "num_tokens": 284533349.0, "step": 7461 }, { "epoch": 0.9492430988423864, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.42041778564453, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8646215200424194, "num_tokens": 284567777.0, "step": 7462 }, { "epoch": 0.949370309120977, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.40179443359375, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8513813018798828, "num_tokens": 284609120.0, "step": 7463 }, { "epoch": 0.9494975193995675, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.76472091674805, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8509413003921509, "num_tokens": 284649221.0, "step": 7464 }, { "epoch": 0.949624729678158, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.3721923828125, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8629910945892334, "num_tokens": 284682595.0, "step": 7465 }, { "epoch": 0.9497519399567486, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.91676712036133, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8540769815444946, "num_tokens": 284722903.0, "step": 7466 }, { "epoch": 0.949879150235339, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.26176834106445, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8631774187088013, "num_tokens": 284759166.0, "step": 7467 }, { "epoch": 0.9500063605139295, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.94136428833008, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8538820147514343, "num_tokens": 284803192.0, "step": 7468 }, { "epoch": 0.95013357079252, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.456138610839844, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8604053854942322, "num_tokens": 284837088.0, "step": 7469 }, { "epoch": 0.9502607810711106, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.5937614440918, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8382900953292847, "num_tokens": 284879641.0, "step": 7470 }, { "epoch": 0.9503879913497011, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.49724197387695, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8472214341163635, "num_tokens": 284917519.0, "step": 7471 }, { "epoch": 0.9505152016282916, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.793861389160156, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8489744663238525, "num_tokens": 284950422.0, "step": 7472 }, { "epoch": 0.950642411906882, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.720863342285156, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8648309111595154, "num_tokens": 284986392.0, "step": 7473 }, { "epoch": 0.9507696221854726, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.658302307128906, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8597390651702881, "num_tokens": 285027684.0, "step": 7474 }, { "epoch": 0.9508968324640631, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 34.084712982177734, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8486003875732422, "num_tokens": 285059551.0, "step": 7475 }, { "epoch": 0.9510240427426536, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.56027603149414, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8597049117088318, "num_tokens": 285100197.0, "step": 7476 }, { "epoch": 0.9511512530212441, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 34.04611587524414, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8578695058822632, "num_tokens": 285130115.0, "step": 7477 }, { "epoch": 0.9512784632998347, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.10597229003906, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8589060306549072, "num_tokens": 285169069.0, "step": 7478 }, { "epoch": 0.9514056735784251, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 34.20901107788086, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8380295038223267, "num_tokens": 285209971.0, "step": 7479 }, { "epoch": 0.9515328838570156, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.552860260009766, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8524456024169922, "num_tokens": 285250024.0, "step": 7480 }, { "epoch": 0.9516600941356061, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.91001892089844, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8527123332023621, "num_tokens": 285284900.0, "step": 7481 }, { "epoch": 0.9517873044141967, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.695465087890625, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8632826805114746, "num_tokens": 285325657.0, "step": 7482 }, { "epoch": 0.9519145146927872, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.62651824951172, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8578243851661682, "num_tokens": 285356362.0, "step": 7483 }, { "epoch": 0.9520417249713777, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.61647033691406, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8579431772232056, "num_tokens": 285394321.0, "step": 7484 }, { "epoch": 0.9521689352499682, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.696685791015625, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8763428926467896, "num_tokens": 285435260.0, "step": 7485 }, { "epoch": 0.9522961455285587, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.679813385009766, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8659130334854126, "num_tokens": 285479449.0, "step": 7486 }, { "epoch": 0.9524233558071492, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.62575149536133, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8370264768600464, "num_tokens": 285520492.0, "step": 7487 }, { "epoch": 0.9525505660857397, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.54153823852539, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.854613184928894, "num_tokens": 285558410.0, "step": 7488 }, { "epoch": 0.9526777763643303, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.60186004638672, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.838064432144165, "num_tokens": 285598871.0, "step": 7489 }, { "epoch": 0.9528049866429208, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.67339324951172, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8495168685913086, "num_tokens": 285641374.0, "step": 7490 }, { "epoch": 0.9529321969215112, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.34772872924805, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8684778213500977, "num_tokens": 285675416.0, "step": 7491 }, { "epoch": 0.9530594072001017, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 34.18248748779297, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8636366128921509, "num_tokens": 285707491.0, "step": 7492 }, { "epoch": 0.9531866174786923, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.194862365722656, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8467761278152466, "num_tokens": 285747571.0, "step": 7493 }, { "epoch": 0.9533138277572828, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.93751525878906, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8815450668334961, "num_tokens": 285786362.0, "step": 7494 }, { "epoch": 0.9534410380358733, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.57422637939453, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8659241199493408, "num_tokens": 285829401.0, "step": 7495 }, { "epoch": 0.9535682483144639, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 34.06269073486328, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8554280996322632, "num_tokens": 285867853.0, "step": 7496 }, { "epoch": 0.9536954585930543, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.76007843017578, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8695353269577026, "num_tokens": 285908497.0, "step": 7497 }, { "epoch": 0.9538226688716448, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.7955322265625, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8443087339401245, "num_tokens": 285940528.0, "step": 7498 }, { "epoch": 0.9539498791502353, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.93195724487305, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8596636056900024, "num_tokens": 285980206.0, "step": 7499 }, { "epoch": 0.9540770894288259, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.49507141113281, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8518639802932739, "num_tokens": 286019409.0, "step": 7500 }, { "epoch": 0.9542042997074164, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.97954177856445, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.866227388381958, "num_tokens": 286050955.0, "step": 7501 }, { "epoch": 0.9543315099860069, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 33.428489685058594, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8632472157478333, "num_tokens": 286087567.0, "step": 7502 }, { "epoch": 0.9544587202645973, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.75149154663086, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8380486965179443, "num_tokens": 286126893.0, "step": 7503 }, { "epoch": 0.9545859305431879, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.75568771362305, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8521995544433594, "num_tokens": 286165818.0, "step": 7504 }, { "epoch": 0.9547131408217784, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.50376510620117, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8596823811531067, "num_tokens": 286208026.0, "step": 7505 }, { "epoch": 0.9548403511003689, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.21662139892578, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8567283749580383, "num_tokens": 286245468.0, "step": 7506 }, { "epoch": 0.9549675613789594, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.79450607299805, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8622201681137085, "num_tokens": 286283955.0, "step": 7507 }, { "epoch": 0.95509477165755, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.94463348388672, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8650928735733032, "num_tokens": 286326560.0, "step": 7508 }, { "epoch": 0.9552219819361404, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 34.080596923828125, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8576107025146484, "num_tokens": 286365791.0, "step": 7509 }, { "epoch": 0.9553491922147309, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.8854866027832, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8676895499229431, "num_tokens": 286404132.0, "step": 7510 }, { "epoch": 0.9554764024933214, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 34.04581832885742, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8475891351699829, "num_tokens": 286444828.0, "step": 7511 }, { "epoch": 0.955603612771912, "ewc_loss": 0.09716796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.963180541992188e-05, "grad_norm": 34.377994537353516, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8624869585037231, "num_tokens": 286481725.0, "step": 7512 }, { "epoch": 0.9557308230505025, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.701908111572266, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8586766719818115, "num_tokens": 286520354.0, "step": 7513 }, { "epoch": 0.955858033329093, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 34.40337371826172, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.856204628944397, "num_tokens": 286564314.0, "step": 7514 }, { "epoch": 0.9559852436076836, "ewc_loss": 0.0966796875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.915496826171875e-05, "grad_norm": 33.9244499206543, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8567583560943604, "num_tokens": 286606955.0, "step": 7515 }, { "epoch": 0.956112453886274, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 34.58529281616211, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.862094521522522, "num_tokens": 286640091.0, "step": 7516 }, { "epoch": 0.9562396641648645, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.68921661376953, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8566776514053345, "num_tokens": 286679090.0, "step": 7517 }, { "epoch": 0.956366874443455, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 34.52687454223633, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8604962825775146, "num_tokens": 286720260.0, "step": 7518 }, { "epoch": 0.9564940847220456, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.84337615966797, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8643406629562378, "num_tokens": 286763992.0, "step": 7519 }, { "epoch": 0.9566212950006361, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.43379592895508, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8382623791694641, "num_tokens": 286807068.0, "step": 7520 }, { "epoch": 0.9567485052792266, "ewc_loss": 0.095703125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.82012939453125e-05, "grad_norm": 33.82552719116211, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8649976849555969, "num_tokens": 286853020.0, "step": 7521 }, { "epoch": 0.956875715557817, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.37067413330078, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.847171425819397, "num_tokens": 286896098.0, "step": 7522 }, { "epoch": 0.9570029258364076, "ewc_loss": 0.09619140625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 7.867813110351562e-05, "grad_norm": 33.73473358154297, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8434616327285767, "num_tokens": 286938873.0, "step": 7523 }, { "epoch": 0.9571301361149981, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.14278030395508, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8622525334358215, "num_tokens": 286972941.0, "step": 7524 }, { "epoch": 0.9572573463935886, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.78298568725586, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8461707830429077, "num_tokens": 287010818.0, "step": 7525 }, { "epoch": 0.9573845566721791, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.7979621887207, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8615443706512451, "num_tokens": 287053841.0, "step": 7526 }, { "epoch": 0.9575117669507697, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.08464050292969, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8636763691902161, "num_tokens": 287090900.0, "step": 7527 }, { "epoch": 0.9576389772293601, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.446102142333984, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.855188250541687, "num_tokens": 287131203.0, "step": 7528 }, { "epoch": 0.9577661875079506, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.485408782958984, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8686904907226562, "num_tokens": 287165949.0, "step": 7529 }, { "epoch": 0.9578933977865411, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.493106842041016, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8579128384590149, "num_tokens": 287203376.0, "step": 7530 }, { "epoch": 0.9580206080651317, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.54743576049805, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8488943576812744, "num_tokens": 287240391.0, "step": 7531 }, { "epoch": 0.9581478183437222, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.479061126708984, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8544549942016602, "num_tokens": 287278485.0, "step": 7532 }, { "epoch": 0.9582750286223127, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.26127624511719, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8524280786514282, "num_tokens": 287313626.0, "step": 7533 }, { "epoch": 0.9584022389009031, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.849609375, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8731669783592224, "num_tokens": 287344532.0, "step": 7534 }, { "epoch": 0.9585294491794937, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.28438949584961, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8388025760650635, "num_tokens": 287383069.0, "step": 7535 }, { "epoch": 0.9586566594580842, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.433589935302734, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.857549786567688, "num_tokens": 287418554.0, "step": 7536 }, { "epoch": 0.9587838697366747, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.40432357788086, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8568878769874573, "num_tokens": 287452449.0, "step": 7537 }, { "epoch": 0.9589110800152653, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.80428695678711, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8428776264190674, "num_tokens": 287492247.0, "step": 7538 }, { "epoch": 0.9590382902938558, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.12548065185547, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8643349409103394, "num_tokens": 287528651.0, "step": 7539 }, { "epoch": 0.9591655005724462, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.491310119628906, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8477836847305298, "num_tokens": 287565837.0, "step": 7540 }, { "epoch": 0.9592927108510367, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.09435272216797, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8561767339706421, "num_tokens": 287599469.0, "step": 7541 }, { "epoch": 0.9594199211296273, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.81200408935547, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8557626008987427, "num_tokens": 287637104.0, "step": 7542 }, { "epoch": 0.9595471314082178, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.88484573364258, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8597176671028137, "num_tokens": 287679134.0, "step": 7543 }, { "epoch": 0.9596743416868083, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.81525802612305, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8637107610702515, "num_tokens": 287720340.0, "step": 7544 }, { "epoch": 0.9598015519653988, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.109683990478516, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8626046180725098, "num_tokens": 287760931.0, "step": 7545 }, { "epoch": 0.9599287622439893, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.6287841796875, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.84425950050354, "num_tokens": 287801723.0, "step": 7546 }, { "epoch": 0.9600559725225798, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.18845748901367, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.855232298374176, "num_tokens": 287837010.0, "step": 7547 }, { "epoch": 0.9601831828011703, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.700340270996094, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8583511114120483, "num_tokens": 287874381.0, "step": 7548 }, { "epoch": 0.9603103930797608, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.17579650878906, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8499743938446045, "num_tokens": 287916673.0, "step": 7549 }, { "epoch": 0.9604376033583514, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.77816390991211, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8534552454948425, "num_tokens": 287956764.0, "step": 7550 }, { "epoch": 0.9605648136369419, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.1153564453125, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8657566905021667, "num_tokens": 287996373.0, "step": 7551 }, { "epoch": 0.9606920239155323, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.9022102355957, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8529902696609497, "num_tokens": 288033671.0, "step": 7552 }, { "epoch": 0.9608192341941229, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.0838737487793, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8672150373458862, "num_tokens": 288068254.0, "step": 7553 }, { "epoch": 0.9609464444727134, "ewc_loss": 0.09814453125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.058547973632812e-05, "grad_norm": 33.79571533203125, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8432450294494629, "num_tokens": 288102833.0, "step": 7554 }, { "epoch": 0.9610736547513039, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.80460739135742, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8713903427124023, "num_tokens": 288135927.0, "step": 7555 }, { "epoch": 0.9612008650298944, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.87385559082031, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8782331943511963, "num_tokens": 288173842.0, "step": 7556 }, { "epoch": 0.961328075308485, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.94407272338867, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8632149696350098, "num_tokens": 288215007.0, "step": 7557 }, { "epoch": 0.9614552855870754, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.82798767089844, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8463147282600403, "num_tokens": 288251598.0, "step": 7558 }, { "epoch": 0.9615824958656659, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.71489715576172, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8591636419296265, "num_tokens": 288292811.0, "step": 7559 }, { "epoch": 0.9617097061442564, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.29154968261719, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8623249530792236, "num_tokens": 288337790.0, "step": 7560 }, { "epoch": 0.961836916422847, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.532833099365234, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8553845882415771, "num_tokens": 288373347.0, "step": 7561 }, { "epoch": 0.9619641267014375, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.9229736328125, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8782318234443665, "num_tokens": 288405650.0, "step": 7562 }, { "epoch": 0.962091336980028, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.76658630371094, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8646551966667175, "num_tokens": 288443849.0, "step": 7563 }, { "epoch": 0.9622185472586186, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.9667854309082, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8360326290130615, "num_tokens": 288487557.0, "step": 7564 }, { "epoch": 0.962345757537209, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.88871383666992, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8480933904647827, "num_tokens": 288529932.0, "step": 7565 }, { "epoch": 0.9624729678157995, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.80929183959961, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8539783954620361, "num_tokens": 288568276.0, "step": 7566 }, { "epoch": 0.96260017809439, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.79377746582031, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8613306283950806, "num_tokens": 288602387.0, "step": 7567 }, { "epoch": 0.9627273883729806, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.98341369628906, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8665043711662292, "num_tokens": 288638157.0, "step": 7568 }, { "epoch": 0.9628545986515711, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.87350082397461, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8535657525062561, "num_tokens": 288675054.0, "step": 7569 }, { "epoch": 0.9629818089301616, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.584232330322266, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8632178902626038, "num_tokens": 288714222.0, "step": 7570 }, { "epoch": 0.963109019208752, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.94500732421875, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8469520807266235, "num_tokens": 288752772.0, "step": 7571 }, { "epoch": 0.9632362294873426, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.99690628051758, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8409280180931091, "num_tokens": 288792067.0, "step": 7572 }, { "epoch": 0.9633634397659331, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.27342224121094, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8692305088043213, "num_tokens": 288826172.0, "step": 7573 }, { "epoch": 0.9634906500445236, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.748023986816406, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8693094253540039, "num_tokens": 288863774.0, "step": 7574 }, { "epoch": 0.9636178603231141, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.011959075927734, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8523067235946655, "num_tokens": 288901892.0, "step": 7575 }, { "epoch": 0.9637450706017047, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.64179992675781, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8475966453552246, "num_tokens": 288941470.0, "step": 7576 }, { "epoch": 0.9638722808802951, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.15699005126953, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8440797328948975, "num_tokens": 288988616.0, "step": 7577 }, { "epoch": 0.9639994911588856, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.888614654541016, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8758567571640015, "num_tokens": 289024589.0, "step": 7578 }, { "epoch": 0.9641267014374761, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.01644515991211, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8593252301216125, "num_tokens": 289064334.0, "step": 7579 }, { "epoch": 0.9642539117160667, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.689422607421875, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8450614213943481, "num_tokens": 289099385.0, "step": 7580 }, { "epoch": 0.9643811219946572, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.001373291015625, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8617892265319824, "num_tokens": 289139143.0, "step": 7581 }, { "epoch": 0.9645083322732477, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.044857025146484, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8524044752120972, "num_tokens": 289179303.0, "step": 7582 }, { "epoch": 0.9646355425518381, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.764305114746094, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.84967041015625, "num_tokens": 289214218.0, "step": 7583 }, { "epoch": 0.9647627528304287, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.93980407714844, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.854323148727417, "num_tokens": 289240991.0, "step": 7584 }, { "epoch": 0.9648899631090192, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.80077362060547, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8523061275482178, "num_tokens": 289273374.0, "step": 7585 }, { "epoch": 0.9650171733876097, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.66959762573242, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8445438146591187, "num_tokens": 289314904.0, "step": 7586 }, { "epoch": 0.9651443836662003, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.008460998535156, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8506651520729065, "num_tokens": 289358110.0, "step": 7587 }, { "epoch": 0.9652715939447908, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.782875061035156, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8489811420440674, "num_tokens": 289391581.0, "step": 7588 }, { "epoch": 0.9653988042233812, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.892826080322266, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8311342000961304, "num_tokens": 289432280.0, "step": 7589 }, { "epoch": 0.9655260145019717, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.15599060058594, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8418457508087158, "num_tokens": 289473222.0, "step": 7590 }, { "epoch": 0.9656532247805623, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.78496551513672, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8531489372253418, "num_tokens": 289512898.0, "step": 7591 }, { "epoch": 0.9657804350591528, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.889556884765625, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8616445064544678, "num_tokens": 289554394.0, "step": 7592 }, { "epoch": 0.9659076453377433, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.74057388305664, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8539493083953857, "num_tokens": 289594871.0, "step": 7593 }, { "epoch": 0.9660348556163338, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.70596694946289, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8468823432922363, "num_tokens": 289629165.0, "step": 7594 }, { "epoch": 0.9661620658949243, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.79875946044922, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8560047745704651, "num_tokens": 289670131.0, "step": 7595 }, { "epoch": 0.9662892761735148, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.60695266723633, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8592404127120972, "num_tokens": 289705238.0, "step": 7596 }, { "epoch": 0.9664164864521053, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.99460220336914, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.860883355140686, "num_tokens": 289740146.0, "step": 7597 }, { "epoch": 0.9665436967306958, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.68199157714844, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8614978790283203, "num_tokens": 289781145.0, "step": 7598 }, { "epoch": 0.9666709070092864, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.96079635620117, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8396742939949036, "num_tokens": 289816347.0, "step": 7599 }, { "epoch": 0.9667981172878769, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.754058837890625, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8435819149017334, "num_tokens": 289852974.0, "step": 7600 }, { "epoch": 0.9669253275664673, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.88923645019531, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8457061052322388, "num_tokens": 289894553.0, "step": 7601 }, { "epoch": 0.9670525378450578, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.75364685058594, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8381839990615845, "num_tokens": 289930641.0, "step": 7602 }, { "epoch": 0.9671797481236484, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.91168212890625, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8503808379173279, "num_tokens": 289972690.0, "step": 7603 }, { "epoch": 0.9673069584022389, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.84041213989258, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.850786566734314, "num_tokens": 290006518.0, "step": 7604 }, { "epoch": 0.9674341686808294, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.881404876708984, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.84406578540802, "num_tokens": 290045206.0, "step": 7605 }, { "epoch": 0.96756137895942, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.949249267578125, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8617119193077087, "num_tokens": 290086976.0, "step": 7606 }, { "epoch": 0.9676885892380104, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.51064682006836, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8708480596542358, "num_tokens": 290126894.0, "step": 7607 }, { "epoch": 0.9678157995166009, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.056270599365234, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8722421526908875, "num_tokens": 290161684.0, "step": 7608 }, { "epoch": 0.9679430097951914, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.96623611450195, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8590030670166016, "num_tokens": 290203407.0, "step": 7609 }, { "epoch": 0.968070220073782, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.95045471191406, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8640333414077759, "num_tokens": 290238645.0, "step": 7610 }, { "epoch": 0.9681974303523725, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.0518684387207, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8581261038780212, "num_tokens": 290280279.0, "step": 7611 }, { "epoch": 0.968324640630963, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.02070999145508, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8560069799423218, "num_tokens": 290317739.0, "step": 7612 }, { "epoch": 0.9684518509095535, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.606571197509766, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8549408316612244, "num_tokens": 290360384.0, "step": 7613 }, { "epoch": 0.968579061188144, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.29581832885742, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8632950782775879, "num_tokens": 290398706.0, "step": 7614 }, { "epoch": 0.9687062714667345, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.543792724609375, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8702359199523926, "num_tokens": 290439524.0, "step": 7615 }, { "epoch": 0.968833481745325, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.212730407714844, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8515471816062927, "num_tokens": 290472125.0, "step": 7616 }, { "epoch": 0.9689606920239155, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.714054107666016, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8405364751815796, "num_tokens": 290514348.0, "step": 7617 }, { "epoch": 0.9690879023025061, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 33.82583236694336, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8507946729660034, "num_tokens": 290551699.0, "step": 7618 }, { "epoch": 0.9692151125810966, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 33.75664520263672, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8582602739334106, "num_tokens": 290588644.0, "step": 7619 }, { "epoch": 0.969342322859687, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 33.86726760864258, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8576458096504211, "num_tokens": 290629546.0, "step": 7620 }, { "epoch": 0.9694695331382776, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.05354690551758, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8654317855834961, "num_tokens": 290665527.0, "step": 7621 }, { "epoch": 0.9695967434168681, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.713951110839844, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8484313488006592, "num_tokens": 290706667.0, "step": 7622 }, { "epoch": 0.9697239536954586, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.03555679321289, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8515055775642395, "num_tokens": 290745698.0, "step": 7623 }, { "epoch": 0.9698511639740491, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.600399017333984, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8630245923995972, "num_tokens": 290777983.0, "step": 7624 }, { "epoch": 0.9699783742526397, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.03248977661133, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8519896864891052, "num_tokens": 290814184.0, "step": 7625 }, { "epoch": 0.9701055845312301, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 33.69794845581055, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8490793704986572, "num_tokens": 290854054.0, "step": 7626 }, { "epoch": 0.9702327948098206, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 33.9796142578125, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8550286293029785, "num_tokens": 290891325.0, "step": 7627 }, { "epoch": 0.9703600050884111, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.96342849731445, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8551598191261292, "num_tokens": 290930101.0, "step": 7628 }, { "epoch": 0.9704872153670017, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.07538604736328, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8570208549499512, "num_tokens": 290964691.0, "step": 7629 }, { "epoch": 0.9706144256455922, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.97222137451172, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8546220660209656, "num_tokens": 291008140.0, "step": 7630 }, { "epoch": 0.9707416359241827, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.12062454223633, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8524448871612549, "num_tokens": 291041484.0, "step": 7631 }, { "epoch": 0.9708688462027731, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.84211730957031, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8388358354568481, "num_tokens": 291082337.0, "step": 7632 }, { "epoch": 0.9709960564813637, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.09881591796875, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8562145233154297, "num_tokens": 291115921.0, "step": 7633 }, { "epoch": 0.9711232667599542, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.05377197265625, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8667872548103333, "num_tokens": 291154367.0, "step": 7634 }, { "epoch": 0.9712504770385447, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.033634185791016, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8572794198989868, "num_tokens": 291198919.0, "step": 7635 }, { "epoch": 0.9713776873171353, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.042457580566406, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.844649612903595, "num_tokens": 291237062.0, "step": 7636 }, { "epoch": 0.9715048975957258, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.37517166137695, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.869297206401825, "num_tokens": 291275676.0, "step": 7637 }, { "epoch": 0.9716321078743162, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.93217086791992, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8486844301223755, "num_tokens": 291316785.0, "step": 7638 }, { "epoch": 0.9717593181529067, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.28264617919922, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8536529541015625, "num_tokens": 291360448.0, "step": 7639 }, { "epoch": 0.9718865284314973, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.872032165527344, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8530565500259399, "num_tokens": 291401650.0, "step": 7640 }, { "epoch": 0.9720137387100878, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.48979949951172, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8570707440376282, "num_tokens": 291439736.0, "step": 7641 }, { "epoch": 0.9721409489886783, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.5708122253418, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8620356321334839, "num_tokens": 291479077.0, "step": 7642 }, { "epoch": 0.9722681592672688, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.253013610839844, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8663831949234009, "num_tokens": 291512470.0, "step": 7643 }, { "epoch": 0.9723953695458593, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.211944580078125, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8686138391494751, "num_tokens": 291552797.0, "step": 7644 }, { "epoch": 0.9725225798244498, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.185752868652344, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8610811829566956, "num_tokens": 291589885.0, "step": 7645 }, { "epoch": 0.9726497901030403, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.16402816772461, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.855154275894165, "num_tokens": 291633005.0, "step": 7646 }, { "epoch": 0.9727770003816308, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.20992660522461, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8531454205513, "num_tokens": 291666083.0, "step": 7647 }, { "epoch": 0.9729042106602214, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.2676887512207, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8631918430328369, "num_tokens": 291706421.0, "step": 7648 }, { "epoch": 0.9730314209388119, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.31155776977539, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8617399334907532, "num_tokens": 291744641.0, "step": 7649 }, { "epoch": 0.9731586312174023, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.646034240722656, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.882256269454956, "num_tokens": 291780586.0, "step": 7650 }, { "epoch": 0.9732858414959928, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.841609954833984, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8536607623100281, "num_tokens": 291820482.0, "step": 7651 }, { "epoch": 0.9734130517745834, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.461483001708984, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8671759366989136, "num_tokens": 291858568.0, "step": 7652 }, { "epoch": 0.9735402620531739, "ewc_loss": 0.09765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.0108642578125e-05, "grad_norm": 33.8000373840332, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8478590250015259, "num_tokens": 291889383.0, "step": 7653 }, { "epoch": 0.9736674723317644, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.28742599487305, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8665453195571899, "num_tokens": 291927541.0, "step": 7654 }, { "epoch": 0.973794682610355, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.03501892089844, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.840907871723175, "num_tokens": 291969074.0, "step": 7655 }, { "epoch": 0.9739218928889454, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.21767044067383, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8544794321060181, "num_tokens": 292008311.0, "step": 7656 }, { "epoch": 0.9740491031675359, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.131072998046875, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.847867488861084, "num_tokens": 292051810.0, "step": 7657 }, { "epoch": 0.9741763134461264, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.281654357910156, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8627740144729614, "num_tokens": 292090411.0, "step": 7658 }, { "epoch": 0.974303523724717, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 33.70616149902344, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8485422134399414, "num_tokens": 292124411.0, "step": 7659 }, { "epoch": 0.9744307340033075, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.30830764770508, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.855766773223877, "num_tokens": 292166428.0, "step": 7660 }, { "epoch": 0.974557944281898, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.82441711425781, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8587346076965332, "num_tokens": 292202687.0, "step": 7661 }, { "epoch": 0.9746851545604885, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.37517547607422, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8653997182846069, "num_tokens": 292246364.0, "step": 7662 }, { "epoch": 0.974812364839079, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.952388763427734, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8581855297088623, "num_tokens": 292285789.0, "step": 7663 }, { "epoch": 0.9749395751176695, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.326107025146484, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8519486784934998, "num_tokens": 292320755.0, "step": 7664 }, { "epoch": 0.97506678539626, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.23457717895508, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8669252395629883, "num_tokens": 292362818.0, "step": 7665 }, { "epoch": 0.9751939956748505, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.22142028808594, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8483418226242065, "num_tokens": 292404016.0, "step": 7666 }, { "epoch": 0.9753212059534411, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.38929748535156, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8427928686141968, "num_tokens": 292442117.0, "step": 7667 }, { "epoch": 0.9754484162320316, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.83565139770508, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8638688325881958, "num_tokens": 292472523.0, "step": 7668 }, { "epoch": 0.975575626510622, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.4927864074707, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8602138161659241, "num_tokens": 292508571.0, "step": 7669 }, { "epoch": 0.9757028367892125, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.01778793334961, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8552536368370056, "num_tokens": 292545358.0, "step": 7670 }, { "epoch": 0.9758300470678031, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.513099670410156, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8408820629119873, "num_tokens": 292583962.0, "step": 7671 }, { "epoch": 0.9759572573463936, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.04941177368164, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8555864095687866, "num_tokens": 292616186.0, "step": 7672 }, { "epoch": 0.9760844676249841, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.28893280029297, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8527274131774902, "num_tokens": 292657378.0, "step": 7673 }, { "epoch": 0.9762116779035747, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 33.99094772338867, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8706633448600769, "num_tokens": 292692041.0, "step": 7674 }, { "epoch": 0.9763388881821651, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.32518768310547, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8540845513343811, "num_tokens": 292725062.0, "step": 7675 }, { "epoch": 0.9764660984607556, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.110965728759766, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8633857369422913, "num_tokens": 292766522.0, "step": 7676 }, { "epoch": 0.9765933087393461, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.38770294189453, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8562722206115723, "num_tokens": 292811371.0, "step": 7677 }, { "epoch": 0.9767205190179367, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.14509582519531, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8570196628570557, "num_tokens": 292850237.0, "step": 7678 }, { "epoch": 0.9768477292965272, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.513710021972656, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.860670268535614, "num_tokens": 292887291.0, "step": 7679 }, { "epoch": 0.9769749395751177, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.071964263916016, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8692464828491211, "num_tokens": 292928396.0, "step": 7680 }, { "epoch": 0.9771021498537081, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.62834930419922, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8607404232025146, "num_tokens": 292962748.0, "step": 7681 }, { "epoch": 0.9772293601322987, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 33.95315170288086, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8685745000839233, "num_tokens": 293002330.0, "step": 7682 }, { "epoch": 0.9773565704108892, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.5574951171875, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8571397662162781, "num_tokens": 293038288.0, "step": 7683 }, { "epoch": 0.9774837806894797, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.0240364074707, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8593908548355103, "num_tokens": 293078121.0, "step": 7684 }, { "epoch": 0.9776109909680702, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.23117446899414, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8543344736099243, "num_tokens": 293116846.0, "step": 7685 }, { "epoch": 0.9777382012466608, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.42373275756836, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8542733788490295, "num_tokens": 293162237.0, "step": 7686 }, { "epoch": 0.9778654115252512, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.61273956298828, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8547300100326538, "num_tokens": 293196631.0, "step": 7687 }, { "epoch": 0.9779926218038417, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.050819396972656, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8681001663208008, "num_tokens": 293238883.0, "step": 7688 }, { "epoch": 0.9781198320824323, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.78165054321289, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8485727310180664, "num_tokens": 293270802.0, "step": 7689 }, { "epoch": 0.9782470423610228, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.23142623901367, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8448252081871033, "num_tokens": 293314910.0, "step": 7690 }, { "epoch": 0.9783742526396133, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.638240814208984, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8709797263145447, "num_tokens": 293353083.0, "step": 7691 }, { "epoch": 0.9785014629182038, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.36034393310547, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8589495420455933, "num_tokens": 293389473.0, "step": 7692 }, { "epoch": 0.9786286731967943, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.266544342041016, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8712937831878662, "num_tokens": 293426893.0, "step": 7693 }, { "epoch": 0.9787558834753848, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.26884078979492, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8641405701637268, "num_tokens": 293468116.0, "step": 7694 }, { "epoch": 0.9788830937539753, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.31858825683594, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8597642183303833, "num_tokens": 293503050.0, "step": 7695 }, { "epoch": 0.9790103040325658, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.58356857299805, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8473871946334839, "num_tokens": 293541673.0, "step": 7696 }, { "epoch": 0.9791375143111564, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.075897216796875, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.85478276014328, "num_tokens": 293578472.0, "step": 7697 }, { "epoch": 0.9792647245897469, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.79021072387695, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8587349653244019, "num_tokens": 293611204.0, "step": 7698 }, { "epoch": 0.9793919348683373, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.412567138671875, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8468241691589355, "num_tokens": 293651352.0, "step": 7699 }, { "epoch": 0.9795191451469278, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.82592010498047, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8685224652290344, "num_tokens": 293687396.0, "step": 7700 }, { "epoch": 0.9796463554255184, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.524742126464844, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8530656695365906, "num_tokens": 293725083.0, "step": 7701 }, { "epoch": 0.9797735657041089, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.426292419433594, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8570606708526611, "num_tokens": 293766136.0, "step": 7702 }, { "epoch": 0.9799007759826994, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.64494323730469, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8412631750106812, "num_tokens": 293805810.0, "step": 7703 }, { "epoch": 0.98002798626129, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.35686492919922, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8493176698684692, "num_tokens": 293839041.0, "step": 7704 }, { "epoch": 0.9801551965398804, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.70215606689453, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.872374951839447, "num_tokens": 293876541.0, "step": 7705 }, { "epoch": 0.9802824068184709, "ewc_loss": 0.10009765625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.249282836914062e-05, "grad_norm": 34.412742614746094, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8710936307907104, "num_tokens": 293918141.0, "step": 7706 }, { "epoch": 0.9804096170970614, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.420310974121094, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8444398641586304, "num_tokens": 293958078.0, "step": 7707 }, { "epoch": 0.980536827375652, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.23550033569336, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8671283721923828, "num_tokens": 293990347.0, "step": 7708 }, { "epoch": 0.9806640376542425, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.76770782470703, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8632211685180664, "num_tokens": 294032136.0, "step": 7709 }, { "epoch": 0.980791247932833, "ewc_loss": 0.0986328125, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.106231689453125e-05, "grad_norm": 34.13987731933594, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8445179462432861, "num_tokens": 294065136.0, "step": 7710 }, { "epoch": 0.9809184582114235, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.46290969848633, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8721615672111511, "num_tokens": 294104539.0, "step": 7711 }, { "epoch": 0.981045668490014, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.55575180053711, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8549628853797913, "num_tokens": 294148962.0, "step": 7712 }, { "epoch": 0.9811728787686045, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.307464599609375, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8614097833633423, "num_tokens": 294181866.0, "step": 7713 }, { "epoch": 0.981300089047195, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.6588134765625, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8662163615226746, "num_tokens": 294223558.0, "step": 7714 }, { "epoch": 0.9814272993257855, "ewc_loss": 0.09912109375, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 8.153915405273438e-05, "grad_norm": 34.46819305419922, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.862025797367096, "num_tokens": 294260487.0, "step": 7715 }, { "epoch": 0.9815545096043761, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.39958953857422, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8586754202842712, "num_tokens": 294302642.0, "step": 7716 }, { "epoch": 0.9816817198829666, "ewc_loss": 0.099609375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.20159912109375e-05, "grad_norm": 34.12369155883789, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8477785587310791, "num_tokens": 294338376.0, "step": 7717 }, { "epoch": 0.981808930161557, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.352813720703125, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8509610891342163, "num_tokens": 294372089.0, "step": 7718 }, { "epoch": 0.9819361404401475, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.37697982788086, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8626578450202942, "num_tokens": 294403801.0, "step": 7719 }, { "epoch": 0.9820633507187381, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.229122161865234, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8611181974411011, "num_tokens": 294440301.0, "step": 7720 }, { "epoch": 0.9821905609973286, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.13316345214844, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8576160669326782, "num_tokens": 294472967.0, "step": 7721 }, { "epoch": 0.9823177712759191, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.170719146728516, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.846774697303772, "num_tokens": 294518248.0, "step": 7722 }, { "epoch": 0.9824449815545097, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.24506759643555, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8546783924102783, "num_tokens": 294553899.0, "step": 7723 }, { "epoch": 0.9825721918331001, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.19508743286133, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8355762958526611, "num_tokens": 294591382.0, "step": 7724 }, { "epoch": 0.9826994021116906, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.0318603515625, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8577829003334045, "num_tokens": 294629606.0, "step": 7725 }, { "epoch": 0.9828266123902811, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.34040451049805, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.860090970993042, "num_tokens": 294669417.0, "step": 7726 }, { "epoch": 0.9829538226688717, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.090087890625, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8680307269096375, "num_tokens": 294709876.0, "step": 7727 }, { "epoch": 0.9830810329474622, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.93585968017578, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8491547107696533, "num_tokens": 294750565.0, "step": 7728 }, { "epoch": 0.9832082432260527, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.31190490722656, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8583822846412659, "num_tokens": 294790586.0, "step": 7729 }, { "epoch": 0.9833354535046431, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.42306137084961, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8552433848381042, "num_tokens": 294830176.0, "step": 7730 }, { "epoch": 0.9834626637832337, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.24223709106445, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8445067405700684, "num_tokens": 294874199.0, "step": 7731 }, { "epoch": 0.9835898740618242, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.22962188720703, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8642821311950684, "num_tokens": 294914219.0, "step": 7732 }, { "epoch": 0.9837170843404147, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.19667053222656, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.835761308670044, "num_tokens": 294950726.0, "step": 7733 }, { "epoch": 0.9838442946190052, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.43204879760742, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8586995601654053, "num_tokens": 294993815.0, "step": 7734 }, { "epoch": 0.9839715048975958, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.11433792114258, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8658750057220459, "num_tokens": 295029275.0, "step": 7735 }, { "epoch": 0.9840987151761862, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.432533264160156, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8574835062026978, "num_tokens": 295066166.0, "step": 7736 }, { "epoch": 0.9842259254547767, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 33.90751266479492, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8348183631896973, "num_tokens": 295100436.0, "step": 7737 }, { "epoch": 0.9843531357333672, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.5657844543457, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8581886291503906, "num_tokens": 295141756.0, "step": 7738 }, { "epoch": 0.9844803460119578, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.19384002685547, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8749471306800842, "num_tokens": 295178894.0, "step": 7739 }, { "epoch": 0.9846075562905483, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.500728607177734, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8471885919570923, "num_tokens": 295219133.0, "step": 7740 }, { "epoch": 0.9847347665691388, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.349510192871094, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.859097957611084, "num_tokens": 295260322.0, "step": 7741 }, { "epoch": 0.9848619768477292, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.501739501953125, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8507407903671265, "num_tokens": 295299675.0, "step": 7742 }, { "epoch": 0.9849891871263198, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.204566955566406, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8634767532348633, "num_tokens": 295336281.0, "step": 7743 }, { "epoch": 0.9851163974049103, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.52508544921875, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8560017943382263, "num_tokens": 295376932.0, "step": 7744 }, { "epoch": 0.9852436076835008, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.117462158203125, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8546879291534424, "num_tokens": 295421904.0, "step": 7745 }, { "epoch": 0.9853708179620914, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.34794235229492, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8585304021835327, "num_tokens": 295456688.0, "step": 7746 }, { "epoch": 0.9854980282406819, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.228206634521484, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.874923825263977, "num_tokens": 295500022.0, "step": 7747 }, { "epoch": 0.9856252385192723, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.2274284362793, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8785510063171387, "num_tokens": 295538106.0, "step": 7748 }, { "epoch": 0.9857524487978628, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.20195770263672, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8533424139022827, "num_tokens": 295572465.0, "step": 7749 }, { "epoch": 0.9858796590764534, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.2493896484375, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8634323477745056, "num_tokens": 295614577.0, "step": 7750 }, { "epoch": 0.9860068693550439, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.99732971191406, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8614311218261719, "num_tokens": 295654780.0, "step": 7751 }, { "epoch": 0.9861340796336344, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.46340560913086, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8577227592468262, "num_tokens": 295695880.0, "step": 7752 }, { "epoch": 0.986261289912225, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.171295166015625, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8574711084365845, "num_tokens": 295738176.0, "step": 7753 }, { "epoch": 0.9863885001908154, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.483097076416016, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8464217185974121, "num_tokens": 295777562.0, "step": 7754 }, { "epoch": 0.9865157104694059, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.225704193115234, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8742547035217285, "num_tokens": 295815013.0, "step": 7755 }, { "epoch": 0.9866429207479964, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.36281204223633, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8460162878036499, "num_tokens": 295857751.0, "step": 7756 }, { "epoch": 0.986770131026587, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.21505355834961, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8633377552032471, "num_tokens": 295896033.0, "step": 7757 }, { "epoch": 0.9868973413051775, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.3891716003418, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8594040870666504, "num_tokens": 295936838.0, "step": 7758 }, { "epoch": 0.987024551583768, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.4350700378418, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8627035021781921, "num_tokens": 295972307.0, "step": 7759 }, { "epoch": 0.9871517618623584, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.169349670410156, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8551849126815796, "num_tokens": 296009966.0, "step": 7760 }, { "epoch": 0.987278972140949, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.49763488769531, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.855044960975647, "num_tokens": 296047874.0, "step": 7761 }, { "epoch": 0.9874061824195395, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.12016677856445, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8652783632278442, "num_tokens": 296090591.0, "step": 7762 }, { "epoch": 0.98753339269813, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.509761810302734, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8517827987670898, "num_tokens": 296133024.0, "step": 7763 }, { "epoch": 0.9876606029767205, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 33.96525573730469, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8460747599601746, "num_tokens": 296177347.0, "step": 7764 }, { "epoch": 0.9877878132553111, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.66592025756836, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8483776450157166, "num_tokens": 296215532.0, "step": 7765 }, { "epoch": 0.9879150235339016, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.10026168823242, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8459511399269104, "num_tokens": 296249494.0, "step": 7766 }, { "epoch": 0.988042233812492, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 33.89466857910156, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8485955595970154, "num_tokens": 296289370.0, "step": 7767 }, { "epoch": 0.9881694440910825, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.34306716918945, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.858964204788208, "num_tokens": 296329376.0, "step": 7768 }, { "epoch": 0.9882966543696731, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.04713821411133, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8635292649269104, "num_tokens": 296368875.0, "step": 7769 }, { "epoch": 0.9884238646482636, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.576541900634766, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8572680354118347, "num_tokens": 296404766.0, "step": 7770 }, { "epoch": 0.9885510749268541, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.125038146972656, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8558379411697388, "num_tokens": 296441705.0, "step": 7771 }, { "epoch": 0.9886782852054447, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.69797134399414, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8641939163208008, "num_tokens": 296481075.0, "step": 7772 }, { "epoch": 0.9888054954840351, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.22197341918945, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8530613780021667, "num_tokens": 296511404.0, "step": 7773 }, { "epoch": 0.9889327057626256, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.584716796875, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8479711413383484, "num_tokens": 296546080.0, "step": 7774 }, { "epoch": 0.9890599160412161, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.080535888671875, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8597017526626587, "num_tokens": 296586119.0, "step": 7775 }, { "epoch": 0.9891871263198067, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.528236389160156, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8555678129196167, "num_tokens": 296629602.0, "step": 7776 }, { "epoch": 0.9893143365983972, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.183624267578125, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8506983518600464, "num_tokens": 296666709.0, "step": 7777 }, { "epoch": 0.9894415468769877, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.46487045288086, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8568602800369263, "num_tokens": 296702611.0, "step": 7778 }, { "epoch": 0.9895687571555781, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.9795036315918, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.862133264541626, "num_tokens": 296737649.0, "step": 7779 }, { "epoch": 0.9896959674341687, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.49602127075195, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8596922159194946, "num_tokens": 296776915.0, "step": 7780 }, { "epoch": 0.9898231777127592, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.09505081176758, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8638575673103333, "num_tokens": 296811491.0, "step": 7781 }, { "epoch": 0.9899503879913497, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.54411697387695, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8622688055038452, "num_tokens": 296847602.0, "step": 7782 }, { "epoch": 0.9900775982699402, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.377952575683594, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8408198356628418, "num_tokens": 296884737.0, "step": 7783 }, { "epoch": 0.9902048085485308, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.20087814331055, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8455124497413635, "num_tokens": 296927780.0, "step": 7784 }, { "epoch": 0.9903320188271212, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.62838363647461, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8708468079566956, "num_tokens": 296964200.0, "step": 7785 }, { "epoch": 0.9904592291057117, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.18403625488281, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8495168685913086, "num_tokens": 297009927.0, "step": 7786 }, { "epoch": 0.9905864393843022, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.06080627441406, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8437772989273071, "num_tokens": 297050788.0, "step": 7787 }, { "epoch": 0.9907136496628928, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.89463806152344, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8632196187973022, "num_tokens": 297092398.0, "step": 7788 }, { "epoch": 0.9908408599414833, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.033634185791016, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8548912405967712, "num_tokens": 297130728.0, "step": 7789 }, { "epoch": 0.9909680702200738, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.19257736206055, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8808519244194031, "num_tokens": 297169124.0, "step": 7790 }, { "epoch": 0.9910952804986642, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.87720489501953, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8671274185180664, "num_tokens": 297204914.0, "step": 7791 }, { "epoch": 0.9912224907772548, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.0949821472168, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8543526530265808, "num_tokens": 297240551.0, "step": 7792 }, { "epoch": 0.9913497010558453, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.82038497924805, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8490710258483887, "num_tokens": 297287160.0, "step": 7793 }, { "epoch": 0.9914769113344358, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.229347229003906, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8582479953765869, "num_tokens": 297323394.0, "step": 7794 }, { "epoch": 0.9916041216130264, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.7435188293457, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8355532884597778, "num_tokens": 297357808.0, "step": 7795 }, { "epoch": 0.9917313318916169, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.39093780517578, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8512343168258667, "num_tokens": 297397300.0, "step": 7796 }, { "epoch": 0.9918585421702073, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.23047637939453, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8663991689682007, "num_tokens": 297434699.0, "step": 7797 }, { "epoch": 0.9919857524487978, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.481101989746094, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8567811250686646, "num_tokens": 297473939.0, "step": 7798 }, { "epoch": 0.9921129627273884, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.19222640991211, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8493252992630005, "num_tokens": 297511154.0, "step": 7799 }, { "epoch": 0.9922401730059789, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.80454635620117, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8491284847259521, "num_tokens": 297556092.0, "step": 7800 }, { "epoch": 0.9923673832845694, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.25674819946289, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8624141216278076, "num_tokens": 297598160.0, "step": 7801 }, { "epoch": 0.9924945935631599, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.645938873291016, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8730226755142212, "num_tokens": 297638706.0, "step": 7802 }, { "epoch": 0.9926218038417504, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.32090377807617, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8618550300598145, "num_tokens": 297675955.0, "step": 7803 }, { "epoch": 0.9927490141203409, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.61051559448242, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8527743816375732, "num_tokens": 297715362.0, "step": 7804 }, { "epoch": 0.9928762243989314, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.26695251464844, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8435492515563965, "num_tokens": 297750931.0, "step": 7805 }, { "epoch": 0.993003434677522, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.48117446899414, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8501741886138916, "num_tokens": 297786107.0, "step": 7806 }, { "epoch": 0.9931306449561125, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.61795425415039, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.855750322341919, "num_tokens": 297824782.0, "step": 7807 }, { "epoch": 0.993257855234703, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.378822326660156, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8544914722442627, "num_tokens": 297861748.0, "step": 7808 }, { "epoch": 0.9933850655132934, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.376625061035156, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.857251763343811, "num_tokens": 297902722.0, "step": 7809 }, { "epoch": 0.993512275791884, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.1103515625, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8601906299591064, "num_tokens": 297948709.0, "step": 7810 }, { "epoch": 0.9936394860704745, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.49549865722656, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8599562644958496, "num_tokens": 297986446.0, "step": 7811 }, { "epoch": 0.993766696349065, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.3836669921875, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8498317003250122, "num_tokens": 298024004.0, "step": 7812 }, { "epoch": 0.9938939066276555, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.19023132324219, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8479280471801758, "num_tokens": 298065518.0, "step": 7813 }, { "epoch": 0.9940211169062461, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.59162521362305, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8502482175827026, "num_tokens": 298108138.0, "step": 7814 }, { "epoch": 0.9941483271848366, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.22762680053711, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.85660719871521, "num_tokens": 298149348.0, "step": 7815 }, { "epoch": 0.994275537463427, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.96226119995117, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8554007411003113, "num_tokens": 298186347.0, "step": 7816 }, { "epoch": 0.9944027477420175, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 33.98773193359375, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8357078433036804, "num_tokens": 298223322.0, "step": 7817 }, { "epoch": 0.9945299580206081, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.04389190673828, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8509501218795776, "num_tokens": 298258794.0, "step": 7818 }, { "epoch": 0.9946571682991986, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.19439697265625, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8491756916046143, "num_tokens": 298297510.0, "step": 7819 }, { "epoch": 0.9947843785777891, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.82366180419922, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8604763746261597, "num_tokens": 298333784.0, "step": 7820 }, { "epoch": 0.9949115888563796, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.407745361328125, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8616785407066345, "num_tokens": 298371241.0, "step": 7821 }, { "epoch": 0.9950387991349701, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.52988052368164, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8424234390258789, "num_tokens": 298407902.0, "step": 7822 }, { "epoch": 0.9951660094135606, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.627681732177734, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.830841600894928, "num_tokens": 298452339.0, "step": 7823 }, { "epoch": 0.9952932196921511, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.4483528137207, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8476145267486572, "num_tokens": 298495854.0, "step": 7824 }, { "epoch": 0.9954204299707416, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.235687255859375, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8504167795181274, "num_tokens": 298529642.0, "step": 7825 }, { "epoch": 0.9955476402493322, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.56463623046875, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8590203523635864, "num_tokens": 298574396.0, "step": 7826 }, { "epoch": 0.9956748505279227, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.559661865234375, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8436673283576965, "num_tokens": 298612457.0, "step": 7827 }, { "epoch": 0.9958020608065131, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.19917297363281, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.856104850769043, "num_tokens": 298645061.0, "step": 7828 }, { "epoch": 0.9959292710851037, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.42677307128906, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8541561961174011, "num_tokens": 298688014.0, "step": 7829 }, { "epoch": 0.9960564813636942, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.3383903503418, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.860167384147644, "num_tokens": 298727815.0, "step": 7830 }, { "epoch": 0.9961836916422847, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.130218505859375, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8762186765670776, "num_tokens": 298765663.0, "step": 7831 }, { "epoch": 0.9963109019208752, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.576602935791016, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.859697699546814, "num_tokens": 298799360.0, "step": 7832 }, { "epoch": 0.9964381121994658, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.240501403808594, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8570315837860107, "num_tokens": 298835669.0, "step": 7833 }, { "epoch": 0.9965653224780562, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.511375427246094, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8541786670684814, "num_tokens": 298866871.0, "step": 7834 }, { "epoch": 0.9966925327566467, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.40656280517578, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8480806350708008, "num_tokens": 298904136.0, "step": 7835 }, { "epoch": 0.9968197430352372, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.15885925292969, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8684139251708984, "num_tokens": 298937069.0, "step": 7836 }, { "epoch": 0.9969469533138278, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.31845474243164, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8479026556015015, "num_tokens": 298976778.0, "step": 7837 }, { "epoch": 0.9970741635924183, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.46636199951172, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8663077354431152, "num_tokens": 299010808.0, "step": 7838 }, { "epoch": 0.9972013738710088, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.359222412109375, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8444907665252686, "num_tokens": 299047611.0, "step": 7839 }, { "epoch": 0.9973285841495992, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.80667495727539, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8620759844779968, "num_tokens": 299088668.0, "step": 7840 }, { "epoch": 0.9974557944281898, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.643856048583984, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8359728455543518, "num_tokens": 299126572.0, "step": 7841 }, { "epoch": 0.9975830047067803, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.59975051879883, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8687543869018555, "num_tokens": 299164806.0, "step": 7842 }, { "epoch": 0.9977102149853708, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.54952621459961, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8533704876899719, "num_tokens": 299207039.0, "step": 7843 }, { "epoch": 0.9978374252639614, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 35.115264892578125, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8665561676025391, "num_tokens": 299243242.0, "step": 7844 }, { "epoch": 0.9979646355425519, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.258731842041016, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8628130555152893, "num_tokens": 299282326.0, "step": 7845 }, { "epoch": 0.9980918458211423, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.22643280029297, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8857136964797974, "num_tokens": 299316645.0, "step": 7846 }, { "epoch": 0.9982190560997328, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.665428161621094, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.859520673751831, "num_tokens": 299349932.0, "step": 7847 }, { "epoch": 0.9983462663783234, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.59174728393555, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8342754244804382, "num_tokens": 299392452.0, "step": 7848 }, { "epoch": 0.9984734766569139, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.966590881347656, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8630068302154541, "num_tokens": 299431479.0, "step": 7849 }, { "epoch": 0.9986006869355044, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.204315185546875, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8553798198699951, "num_tokens": 299475585.0, "step": 7850 }, { "epoch": 0.9987278972140949, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 35.093746185302734, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8714410662651062, "num_tokens": 299514026.0, "step": 7851 }, { "epoch": 0.9988551074926854, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.527557373046875, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8472815752029419, "num_tokens": 299550858.0, "step": 7852 }, { "epoch": 0.9989823177712759, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.57958984375, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8504552841186523, "num_tokens": 299586831.0, "step": 7853 }, { "epoch": 0.9991095280498664, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.689117431640625, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8667237758636475, "num_tokens": 299623116.0, "step": 7854 }, { "epoch": 0.9992367383284569, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.71359634399414, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8596301078796387, "num_tokens": 299662165.0, "step": 7855 }, { "epoch": 0.9993639486070475, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.44970703125, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8533782958984375, "num_tokens": 299702581.0, "step": 7856 }, { "epoch": 0.999491158885638, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 35.12079620361328, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8425899147987366, "num_tokens": 299742559.0, "step": 7857 }, { "epoch": 0.9996183691642284, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.59518051147461, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8730556964874268, "num_tokens": 299778637.0, "step": 7858 }, { "epoch": 0.9997455794428189, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.749393463134766, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8660649657249451, "num_tokens": 299812808.0, "step": 7859 }, { "epoch": 0.9998727897214095, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.65290832519531, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8538975715637207, "num_tokens": 299848987.0, "step": 7860 }, { "epoch": 1.0, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.73939514160156, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8393963575363159, "num_tokens": 299886286.0, "step": 7861 }, { "epoch": 1.0001272102785905, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.7102165222168, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8496360182762146, "num_tokens": 299925456.0, "step": 7862 }, { "epoch": 1.000254420557181, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.68185043334961, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8530475497245789, "num_tokens": 299965936.0, "step": 7863 }, { "epoch": 1.0003816308357716, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.42643356323242, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8683309555053711, "num_tokens": 300003181.0, "step": 7864 }, { "epoch": 1.0005088411143621, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.737369537353516, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8552435040473938, "num_tokens": 300040502.0, "step": 7865 }, { "epoch": 1.0006360513929526, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.70687484741211, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8719667196273804, "num_tokens": 300078795.0, "step": 7866 }, { "epoch": 1.0007632616715432, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.198280334472656, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8671383857727051, "num_tokens": 300115188.0, "step": 7867 }, { "epoch": 1.0008904719501335, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.819942474365234, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8585417866706848, "num_tokens": 300158583.0, "step": 7868 }, { "epoch": 1.001017682228724, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.46525573730469, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8611099720001221, "num_tokens": 300194415.0, "step": 7869 }, { "epoch": 1.0011448925073145, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.794166564941406, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8525539040565491, "num_tokens": 300235066.0, "step": 7870 }, { "epoch": 1.001272102785905, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.615562438964844, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8596720695495605, "num_tokens": 300272167.0, "step": 7871 }, { "epoch": 1.0013993130644956, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.76422882080078, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8494484424591064, "num_tokens": 300306032.0, "step": 7872 }, { "epoch": 1.0015265233430861, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.959877014160156, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8538949489593506, "num_tokens": 300338433.0, "step": 7873 }, { "epoch": 1.0016537336216766, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.81608200073242, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8456583619117737, "num_tokens": 300376527.0, "step": 7874 }, { "epoch": 1.0017809439002672, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 35.085819244384766, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8723720908164978, "num_tokens": 300417309.0, "step": 7875 }, { "epoch": 1.0019081541788577, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.46226119995117, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8570095896720886, "num_tokens": 300459438.0, "step": 7876 }, { "epoch": 1.0020353644574482, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.86773681640625, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8345012664794922, "num_tokens": 300501443.0, "step": 7877 }, { "epoch": 1.0021625747360388, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.58695983886719, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8638471961021423, "num_tokens": 300531918.0, "step": 7878 }, { "epoch": 1.0022897850146293, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.61783981323242, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8502846360206604, "num_tokens": 300575084.0, "step": 7879 }, { "epoch": 1.0024169952932196, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.91188430786133, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8570421934127808, "num_tokens": 300608737.0, "step": 7880 }, { "epoch": 1.0025442055718101, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.361183166503906, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8442822098731995, "num_tokens": 300647855.0, "step": 7881 }, { "epoch": 1.0026714158504006, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.82516860961914, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8571909666061401, "num_tokens": 300689763.0, "step": 7882 }, { "epoch": 1.0027986261289912, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.42619705200195, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8686429262161255, "num_tokens": 300725838.0, "step": 7883 }, { "epoch": 1.0029258364075817, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.24055862426758, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8638493418693542, "num_tokens": 300760423.0, "step": 7884 }, { "epoch": 1.0030530466861722, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.41187286376953, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.86334228515625, "num_tokens": 300796236.0, "step": 7885 }, { "epoch": 1.0031802569647628, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.16660690307617, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8629299998283386, "num_tokens": 300835213.0, "step": 7886 }, { "epoch": 1.0033074672433533, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.30024337768555, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8644616007804871, "num_tokens": 300873363.0, "step": 7887 }, { "epoch": 1.0034346775219438, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.39928436279297, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8529502749443054, "num_tokens": 300910233.0, "step": 7888 }, { "epoch": 1.0035618878005343, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.56343078613281, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8663794994354248, "num_tokens": 300946986.0, "step": 7889 }, { "epoch": 1.0036890980791249, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.20322799682617, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.859606146812439, "num_tokens": 300985750.0, "step": 7890 }, { "epoch": 1.0038163083577154, "ewc_loss": 0.1005859375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.296966552734375e-05, "grad_norm": 34.64776611328125, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8573129773139954, "num_tokens": 301032820.0, "step": 7891 }, { "epoch": 1.0039435186363057, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 35.10537338256836, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8471700549125671, "num_tokens": 301070825.0, "step": 7892 }, { "epoch": 1.0040707289148962, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.6020622253418, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8545241355895996, "num_tokens": 301110764.0, "step": 7893 }, { "epoch": 1.0041979391934868, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.80175018310547, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.861207127571106, "num_tokens": 301152421.0, "step": 7894 }, { "epoch": 1.0043251494720773, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.434696197509766, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8583863973617554, "num_tokens": 301192780.0, "step": 7895 }, { "epoch": 1.0044523597506678, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.89887237548828, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8541464805603027, "num_tokens": 301228147.0, "step": 7896 }, { "epoch": 1.0045795700292584, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.95662307739258, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8684825897216797, "num_tokens": 301259568.0, "step": 7897 }, { "epoch": 1.0047067803078489, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.856998443603516, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8578565120697021, "num_tokens": 301296483.0, "step": 7898 }, { "epoch": 1.0048339905864394, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.90443420410156, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8599466681480408, "num_tokens": 301337437.0, "step": 7899 }, { "epoch": 1.00496120086503, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.658321380615234, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.856544017791748, "num_tokens": 301368777.0, "step": 7900 }, { "epoch": 1.0050884111436205, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 35.06704330444336, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8545262813568115, "num_tokens": 301404885.0, "step": 7901 }, { "epoch": 1.005215621422211, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.561153411865234, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8465695381164551, "num_tokens": 301445308.0, "step": 7902 }, { "epoch": 1.0053428317008015, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.79654312133789, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8540750741958618, "num_tokens": 301484845.0, "step": 7903 }, { "epoch": 1.0054700419793918, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.533714294433594, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8551843166351318, "num_tokens": 301524498.0, "step": 7904 }, { "epoch": 1.0055972522579824, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 35.067344665527344, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8632304668426514, "num_tokens": 301565522.0, "step": 7905 }, { "epoch": 1.0057244625365729, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.447898864746094, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8536584377288818, "num_tokens": 301603724.0, "step": 7906 }, { "epoch": 1.0058516728151634, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.89643859863281, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8550441265106201, "num_tokens": 301636650.0, "step": 7907 }, { "epoch": 1.005978883093754, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.70310974121094, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8469032645225525, "num_tokens": 301675328.0, "step": 7908 }, { "epoch": 1.0061060933723445, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.13434600830078, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8501679301261902, "num_tokens": 301710058.0, "step": 7909 }, { "epoch": 1.006233303650935, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.52143096923828, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8500155210494995, "num_tokens": 301753567.0, "step": 7910 }, { "epoch": 1.0063605139295255, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.0107421875, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8651987910270691, "num_tokens": 301792895.0, "step": 7911 }, { "epoch": 1.006487724208116, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.80009460449219, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8702393770217896, "num_tokens": 301833001.0, "step": 7912 }, { "epoch": 1.0066149344867066, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.721012115478516, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8602941036224365, "num_tokens": 301871623.0, "step": 7913 }, { "epoch": 1.006742144765297, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.58736801147461, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8679369688034058, "num_tokens": 301903750.0, "step": 7914 }, { "epoch": 1.0068693550438876, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.96260070800781, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8600293397903442, "num_tokens": 301943079.0, "step": 7915 }, { "epoch": 1.0069965653224782, "ewc_loss": 0.10107421875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.344650268554688e-05, "grad_norm": 34.339969635009766, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8626303672790527, "num_tokens": 301980031.0, "step": 7916 }, { "epoch": 1.0071237756010685, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.5875244140625, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8569064140319824, "num_tokens": 302020454.0, "step": 7917 }, { "epoch": 1.007250985879659, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.80878829956055, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.851248025894165, "num_tokens": 302064391.0, "step": 7918 }, { "epoch": 1.0073781961582495, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.61698532104492, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8573017120361328, "num_tokens": 302101598.0, "step": 7919 }, { "epoch": 1.00750540643684, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.76982498168945, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8631873726844788, "num_tokens": 302138803.0, "step": 7920 }, { "epoch": 1.0076326167154306, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.69236755371094, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8554326891899109, "num_tokens": 302173682.0, "step": 7921 }, { "epoch": 1.0077598269940211, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.677490234375, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8552478551864624, "num_tokens": 302207390.0, "step": 7922 }, { "epoch": 1.0078870372726116, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.96894454956055, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8617004156112671, "num_tokens": 302244440.0, "step": 7923 }, { "epoch": 1.0080142475512022, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.686771392822266, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8519549369812012, "num_tokens": 302281058.0, "step": 7924 }, { "epoch": 1.0081414578297927, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.93227767944336, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8751225471496582, "num_tokens": 302319586.0, "step": 7925 }, { "epoch": 1.0082686681083832, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.37855911254883, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8532674312591553, "num_tokens": 302361356.0, "step": 7926 }, { "epoch": 1.0083958783869738, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.95835876464844, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8666045665740967, "num_tokens": 302399974.0, "step": 7927 }, { "epoch": 1.0085230886655643, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.6272087097168, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.86177659034729, "num_tokens": 302435709.0, "step": 7928 }, { "epoch": 1.0086502989441546, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.95794677734375, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8539056777954102, "num_tokens": 302474151.0, "step": 7929 }, { "epoch": 1.0087775092227451, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.6815071105957, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8591136932373047, "num_tokens": 302509877.0, "step": 7930 }, { "epoch": 1.0089047195013356, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.30075454711914, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8728342056274414, "num_tokens": 302547689.0, "step": 7931 }, { "epoch": 1.0090319297799262, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.2515869140625, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8555777668952942, "num_tokens": 302590728.0, "step": 7932 }, { "epoch": 1.0091591400585167, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.16693878173828, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8337996006011963, "num_tokens": 302625709.0, "step": 7933 }, { "epoch": 1.0092863503371072, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.68111038208008, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8638991713523865, "num_tokens": 302662086.0, "step": 7934 }, { "epoch": 1.0094135606156978, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.0919303894043, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8743789792060852, "num_tokens": 302703680.0, "step": 7935 }, { "epoch": 1.0095407708942883, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.657196044921875, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8410247564315796, "num_tokens": 302740426.0, "step": 7936 }, { "epoch": 1.0096679811728788, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.94673156738281, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.871226966381073, "num_tokens": 302780778.0, "step": 7937 }, { "epoch": 1.0097951914514693, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 35.7614860534668, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8626391887664795, "num_tokens": 302816012.0, "step": 7938 }, { "epoch": 1.0099224017300599, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.89017868041992, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8347256183624268, "num_tokens": 302858325.0, "step": 7939 }, { "epoch": 1.0100496120086504, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.78371810913086, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8592768907546997, "num_tokens": 302898210.0, "step": 7940 }, { "epoch": 1.0101768222872407, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.709922790527344, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8593112230300903, "num_tokens": 302936072.0, "step": 7941 }, { "epoch": 1.0103040325658312, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.94239044189453, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8490293025970459, "num_tokens": 302978941.0, "step": 7942 }, { "epoch": 1.0104312428444218, "ewc_loss": 0.10205078125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.556053161621094, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8499447107315063, "num_tokens": 303016664.0, "step": 7943 }, { "epoch": 1.0105584531230123, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 50.524017333984375, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8535204529762268, "num_tokens": 303063313.0, "step": 7944 }, { "epoch": 1.0106856634016028, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.40763473510742, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8471547365188599, "num_tokens": 303103786.0, "step": 7945 }, { "epoch": 1.0108128736801933, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.568626403808594, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.864079475402832, "num_tokens": 303136928.0, "step": 7946 }, { "epoch": 1.0109400839587839, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 35.04410934448242, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.865211546421051, "num_tokens": 303174854.0, "step": 7947 }, { "epoch": 1.0110672942373744, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 36.0030517578125, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8551703691482544, "num_tokens": 303214234.0, "step": 7948 }, { "epoch": 1.011194504515965, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.847412109375, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8708465099334717, "num_tokens": 303251830.0, "step": 7949 }, { "epoch": 1.0113217147945555, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.20896911621094, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8476333618164062, "num_tokens": 303291564.0, "step": 7950 }, { "epoch": 1.011448925073146, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.3065299987793, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.854080080986023, "num_tokens": 303328642.0, "step": 7951 }, { "epoch": 1.0115761353517365, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.877532958984375, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8560782670974731, "num_tokens": 303373693.0, "step": 7952 }, { "epoch": 1.0117033456303268, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.254032135009766, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8622933626174927, "num_tokens": 303407124.0, "step": 7953 }, { "epoch": 1.0118305559089174, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.16678237915039, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8691931962966919, "num_tokens": 303441586.0, "step": 7954 }, { "epoch": 1.0119577661875079, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.766082763671875, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8587655425071716, "num_tokens": 303473593.0, "step": 7955 }, { "epoch": 1.0120849764660984, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.6032600402832, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8536075353622437, "num_tokens": 303510239.0, "step": 7956 }, { "epoch": 1.012212186744689, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.8388786315918, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8560813665390015, "num_tokens": 303544709.0, "step": 7957 }, { "epoch": 1.0123393970232795, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.67168045043945, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8682312965393066, "num_tokens": 303581990.0, "step": 7958 }, { "epoch": 1.01246660730187, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.960636138916016, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8601013422012329, "num_tokens": 303625167.0, "step": 7959 }, { "epoch": 1.0125938175804605, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.47905731201172, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.857795000076294, "num_tokens": 303664751.0, "step": 7960 }, { "epoch": 1.012721027859051, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.92092514038086, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8552159667015076, "num_tokens": 303702190.0, "step": 7961 }, { "epoch": 1.0128482381376416, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.69212341308594, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8716384172439575, "num_tokens": 303742872.0, "step": 7962 }, { "epoch": 1.012975448416232, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.01041030883789, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8519181609153748, "num_tokens": 303781265.0, "step": 7963 }, { "epoch": 1.0131026586948226, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.550533294677734, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8580293655395508, "num_tokens": 303820686.0, "step": 7964 }, { "epoch": 1.0132298689734132, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.985958099365234, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8608547449111938, "num_tokens": 303863428.0, "step": 7965 }, { "epoch": 1.0133570792520035, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.9095458984375, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.852349579334259, "num_tokens": 303903847.0, "step": 7966 }, { "epoch": 1.013484289530594, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.66592788696289, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8501198291778564, "num_tokens": 303943083.0, "step": 7967 }, { "epoch": 1.0136114998091845, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.73817825317383, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8665181994438171, "num_tokens": 303978773.0, "step": 7968 }, { "epoch": 1.013738710087775, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.762481689453125, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.865199089050293, "num_tokens": 304017058.0, "step": 7969 }, { "epoch": 1.0138659203663656, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.84343338012695, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8486841320991516, "num_tokens": 304054104.0, "step": 7970 }, { "epoch": 1.013993130644956, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.91464614868164, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.87082439661026, "num_tokens": 304091100.0, "step": 7971 }, { "epoch": 1.0141203409235466, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.55535125732422, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8582711815834045, "num_tokens": 304127747.0, "step": 7972 }, { "epoch": 1.0142475512021372, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.93801498413086, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8578263521194458, "num_tokens": 304171089.0, "step": 7973 }, { "epoch": 1.0143747614807277, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.54526901245117, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8499670624732971, "num_tokens": 304210207.0, "step": 7974 }, { "epoch": 1.0145019717593182, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.94725036621094, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8745119571685791, "num_tokens": 304249423.0, "step": 7975 }, { "epoch": 1.0146291820379088, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.737911224365234, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.862022340297699, "num_tokens": 304286872.0, "step": 7976 }, { "epoch": 1.0147563923164993, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.6209831237793, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8621840476989746, "num_tokens": 304323033.0, "step": 7977 }, { "epoch": 1.0148836025950896, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.054107666015625, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8611671924591064, "num_tokens": 304360096.0, "step": 7978 }, { "epoch": 1.0150108128736801, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.70318603515625, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8784221410751343, "num_tokens": 304396364.0, "step": 7979 }, { "epoch": 1.0151380231522706, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.072750091552734, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8621493577957153, "num_tokens": 304431583.0, "step": 7980 }, { "epoch": 1.0152652334308612, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.92098617553711, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8521002531051636, "num_tokens": 304470531.0, "step": 7981 }, { "epoch": 1.0153924437094517, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.79336166381836, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8634471893310547, "num_tokens": 304505187.0, "step": 7982 }, { "epoch": 1.0155196539880422, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.88290023803711, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8359693884849548, "num_tokens": 304542685.0, "step": 7983 }, { "epoch": 1.0156468642666328, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.945491790771484, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8583704233169556, "num_tokens": 304579204.0, "step": 7984 }, { "epoch": 1.0157740745452233, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.7456169128418, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8632354736328125, "num_tokens": 304615224.0, "step": 7985 }, { "epoch": 1.0159012848238138, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.26280975341797, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8533098697662354, "num_tokens": 304653513.0, "step": 7986 }, { "epoch": 1.0160284951024043, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.92540740966797, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8636497259140015, "num_tokens": 304686744.0, "step": 7987 }, { "epoch": 1.0161557053809949, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.101409912109375, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.857475996017456, "num_tokens": 304727262.0, "step": 7988 }, { "epoch": 1.0162829156595854, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.14846420288086, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8557552695274353, "num_tokens": 304763418.0, "step": 7989 }, { "epoch": 1.0164101259381757, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 35.132232666015625, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8625377416610718, "num_tokens": 304800763.0, "step": 7990 }, { "epoch": 1.0165373362167662, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 35.05717086791992, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8762142658233643, "num_tokens": 304833136.0, "step": 7991 }, { "epoch": 1.0166645464953568, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.608211517333984, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8664593696594238, "num_tokens": 304873208.0, "step": 7992 }, { "epoch": 1.0167917567739473, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.600162506103516, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8591793775558472, "num_tokens": 304914237.0, "step": 7993 }, { "epoch": 1.0169189670525378, "ewc_loss": 0.1015625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.392333984375e-05, "grad_norm": 34.342552185058594, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8734451532363892, "num_tokens": 304955453.0, "step": 7994 }, { "epoch": 1.0170461773311283, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.24172592163086, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.856621503829956, "num_tokens": 304990533.0, "step": 7995 }, { "epoch": 1.0171733876097189, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.440017700195312e-05, "grad_norm": 34.616172790527344, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8578959703445435, "num_tokens": 305024849.0, "step": 7996 }, { "epoch": 1.0173005978883094, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.8784065246582, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.854084849357605, "num_tokens": 305061615.0, "step": 7997 }, { "epoch": 1.0174278081669, "ewc_loss": 0.1025390625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 34.538944244384766, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.839043140411377, "num_tokens": 305102251.0, "step": 7998 }, { "epoch": 1.0175550184454905, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.706729888916016, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8687053322792053, "num_tokens": 305135405.0, "step": 7999 }, { "epoch": 1.017682228724081, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.56395721435547, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8573862314224243, "num_tokens": 305169005.0, "step": 8000 }, { "epoch": 1.0178094390026715, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.647857666015625, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8536754846572876, "num_tokens": 305209050.0, "step": 8001 }, { "epoch": 1.0179366492812618, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.57999038696289, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8598714470863342, "num_tokens": 305238250.0, "step": 8002 }, { "epoch": 1.0180638595598523, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.94476318359375, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8621912598609924, "num_tokens": 305283544.0, "step": 8003 }, { "epoch": 1.0181910698384429, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.674251556396484, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8411881923675537, "num_tokens": 305320548.0, "step": 8004 }, { "epoch": 1.0183182801170334, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.76895523071289, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8658139705657959, "num_tokens": 305359560.0, "step": 8005 }, { "epoch": 1.018445490395624, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 34.8658561706543, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.855750322341919, "num_tokens": 305396680.0, "step": 8006 }, { "epoch": 1.0185727006742145, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.89518356323242, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8684588670730591, "num_tokens": 305432856.0, "step": 8007 }, { "epoch": 1.018699910952805, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.56635284423828, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8610653877258301, "num_tokens": 305469973.0, "step": 8008 }, { "epoch": 1.0188271212313955, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.51056671142578, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8644140362739563, "num_tokens": 305505576.0, "step": 8009 }, { "epoch": 1.018954331509986, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.701568603515625, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.837832510471344, "num_tokens": 305542037.0, "step": 8010 }, { "epoch": 1.0190815417885766, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.66084289550781, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8675298690795898, "num_tokens": 305587784.0, "step": 8011 }, { "epoch": 1.019208752067167, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.644073486328125, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8516054153442383, "num_tokens": 305621888.0, "step": 8012 }, { "epoch": 1.0193359623457576, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.56928634643555, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8773937821388245, "num_tokens": 305663844.0, "step": 8013 }, { "epoch": 1.0194631726243482, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.8765754699707, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8730459213256836, "num_tokens": 305695913.0, "step": 8014 }, { "epoch": 1.0195903829029385, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.52296447753906, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8554545044898987, "num_tokens": 305733632.0, "step": 8015 }, { "epoch": 1.019717593181529, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.769439697265625, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8448860049247742, "num_tokens": 305766917.0, "step": 8016 }, { "epoch": 1.0198448034601195, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.70775604248047, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8545833230018616, "num_tokens": 305805768.0, "step": 8017 }, { "epoch": 1.01997201373871, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.72468948364258, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.860926628112793, "num_tokens": 305843412.0, "step": 8018 }, { "epoch": 1.0200992240173006, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.75110626220703, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8744407296180725, "num_tokens": 305880523.0, "step": 8019 }, { "epoch": 1.020226434295891, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.667667388916016, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8600902557373047, "num_tokens": 305915379.0, "step": 8020 }, { "epoch": 1.0203536445744816, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.861976623535156, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8728342056274414, "num_tokens": 305952471.0, "step": 8021 }, { "epoch": 1.0204808548530722, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.8316535949707, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8580130338668823, "num_tokens": 305994023.0, "step": 8022 }, { "epoch": 1.0206080651316627, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.638389587402344, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8646552562713623, "num_tokens": 306026563.0, "step": 8023 }, { "epoch": 1.0207352754102532, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.09720993041992, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8550770282745361, "num_tokens": 306067913.0, "step": 8024 }, { "epoch": 1.0208624856888437, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.82672119140625, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8582528829574585, "num_tokens": 306108692.0, "step": 8025 }, { "epoch": 1.0209896959674343, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.902488708496094, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8588394522666931, "num_tokens": 306148082.0, "step": 8026 }, { "epoch": 1.0211169062460246, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.570804595947266, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8656008243560791, "num_tokens": 306182800.0, "step": 8027 }, { "epoch": 1.021244116524615, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.827789306640625, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8569940328598022, "num_tokens": 306226741.0, "step": 8028 }, { "epoch": 1.0213713268032056, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.90052032470703, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8570370078086853, "num_tokens": 306265993.0, "step": 8029 }, { "epoch": 1.0214985370817962, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.63090515136719, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8620578646659851, "num_tokens": 306306511.0, "step": 8030 }, { "epoch": 1.0216257473603867, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.9451904296875, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8739620447158813, "num_tokens": 306337687.0, "step": 8031 }, { "epoch": 1.0217529576389772, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.658016204833984, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8244014978408813, "num_tokens": 306379646.0, "step": 8032 }, { "epoch": 1.0218801679175677, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.06719970703125, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8595669269561768, "num_tokens": 306415389.0, "step": 8033 }, { "epoch": 1.0220073781961583, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.48421096801758, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8530955910682678, "num_tokens": 306452035.0, "step": 8034 }, { "epoch": 1.0221345884747488, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.9268913269043, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8577488660812378, "num_tokens": 306496277.0, "step": 8035 }, { "epoch": 1.0222617987533393, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.520198822021484, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8654400110244751, "num_tokens": 306528212.0, "step": 8036 }, { "epoch": 1.0223890090319299, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.10763931274414, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8620527982711792, "num_tokens": 306562047.0, "step": 8037 }, { "epoch": 1.0225162193105204, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.67441940307617, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.848716676235199, "num_tokens": 306601298.0, "step": 8038 }, { "epoch": 1.0226434295891107, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.8758659362793, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8581330180168152, "num_tokens": 306640961.0, "step": 8039 }, { "epoch": 1.0227706398677012, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.64990997314453, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8776144981384277, "num_tokens": 306679507.0, "step": 8040 }, { "epoch": 1.0228978501462918, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.16035461425781, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8496809005737305, "num_tokens": 306717337.0, "step": 8041 }, { "epoch": 1.0230250604248823, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.982059478759766, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8316294550895691, "num_tokens": 306754152.0, "step": 8042 }, { "epoch": 1.0231522707034728, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.644962310791016, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8524938821792603, "num_tokens": 306794887.0, "step": 8043 }, { "epoch": 1.0232794809820633, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.72160339355469, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8629574775695801, "num_tokens": 306828374.0, "step": 8044 }, { "epoch": 1.0234066912606539, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.86854934692383, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8624141216278076, "num_tokens": 306865285.0, "step": 8045 }, { "epoch": 1.0235339015392444, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.2563591003418, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8637017011642456, "num_tokens": 306897195.0, "step": 8046 }, { "epoch": 1.023661111817835, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.31396484375, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8569554090499878, "num_tokens": 306935352.0, "step": 8047 }, { "epoch": 1.0237883220964255, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.29877853393555, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.863962709903717, "num_tokens": 306964621.0, "step": 8048 }, { "epoch": 1.023915532375016, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.87270736694336, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8580217361450195, "num_tokens": 307003451.0, "step": 8049 }, { "epoch": 1.0240427426536065, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.779151916503906, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8597111105918884, "num_tokens": 307044695.0, "step": 8050 }, { "epoch": 1.0241699529321968, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.824920654296875, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8659454584121704, "num_tokens": 307080603.0, "step": 8051 }, { "epoch": 1.0242971632107873, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.68367385864258, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8511142730712891, "num_tokens": 307122300.0, "step": 8052 }, { "epoch": 1.0244243734893779, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.104000091552734, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8652132749557495, "num_tokens": 307158685.0, "step": 8053 }, { "epoch": 1.0245515837679684, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.33144760131836, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8642906546592712, "num_tokens": 307199104.0, "step": 8054 }, { "epoch": 1.024678794046559, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.15336608886719, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8522895574569702, "num_tokens": 307235840.0, "step": 8055 }, { "epoch": 1.0248060043251495, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.38017654418945, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8472645878791809, "num_tokens": 307273089.0, "step": 8056 }, { "epoch": 1.02493321460374, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.253360748291016, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8599711656570435, "num_tokens": 307318825.0, "step": 8057 }, { "epoch": 1.0250604248823305, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.44240188598633, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8657678961753845, "num_tokens": 307357006.0, "step": 8058 }, { "epoch": 1.025187635160921, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.71245193481445, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.857586681842804, "num_tokens": 307390274.0, "step": 8059 }, { "epoch": 1.0253148454395116, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.16676712036133, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8669185042381287, "num_tokens": 307427157.0, "step": 8060 }, { "epoch": 1.025442055718102, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.56973648071289, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8539927005767822, "num_tokens": 307466555.0, "step": 8061 }, { "epoch": 1.0255692659966926, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.0490608215332, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8498008847236633, "num_tokens": 307503375.0, "step": 8062 }, { "epoch": 1.0256964762752832, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.705787658691406, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8689976930618286, "num_tokens": 307543170.0, "step": 8063 }, { "epoch": 1.0258236865538735, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.15196990966797, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8555135726928711, "num_tokens": 307582890.0, "step": 8064 }, { "epoch": 1.025950896832464, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.73448944091797, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8516778945922852, "num_tokens": 307617643.0, "step": 8065 }, { "epoch": 1.0260781071110545, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.40526580810547, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8562891483306885, "num_tokens": 307652258.0, "step": 8066 }, { "epoch": 1.026205317389645, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.40703201293945, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8537092208862305, "num_tokens": 307693245.0, "step": 8067 }, { "epoch": 1.0263325276682356, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.21748352050781, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8622525930404663, "num_tokens": 307728751.0, "step": 8068 }, { "epoch": 1.026459737946826, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.54531478881836, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8467239141464233, "num_tokens": 307767869.0, "step": 8069 }, { "epoch": 1.0265869482254166, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.78194808959961, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8689299821853638, "num_tokens": 307809438.0, "step": 8070 }, { "epoch": 1.0267141585040072, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.56400680541992, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8382980823516846, "num_tokens": 307848752.0, "step": 8071 }, { "epoch": 1.0268413687825977, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.966949462890625, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8518635034561157, "num_tokens": 307886080.0, "step": 8072 }, { "epoch": 1.0269685790611882, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.692779541015625, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8720841407775879, "num_tokens": 307922854.0, "step": 8073 }, { "epoch": 1.0270957893397787, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.99706268310547, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8550611734390259, "num_tokens": 307960032.0, "step": 8074 }, { "epoch": 1.0272229996183693, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.774635314941406, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8592600226402283, "num_tokens": 307996074.0, "step": 8075 }, { "epoch": 1.0273502098969596, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.840641021728516, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8723204731941223, "num_tokens": 308030833.0, "step": 8076 }, { "epoch": 1.02747742017555, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.90946578979492, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8679531812667847, "num_tokens": 308079620.0, "step": 8077 }, { "epoch": 1.0276046304541406, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.041378021240234, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8517698049545288, "num_tokens": 308118555.0, "step": 8078 }, { "epoch": 1.0277318407327312, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.71345138549805, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.842163622379303, "num_tokens": 308154410.0, "step": 8079 }, { "epoch": 1.0278590510113217, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.25221633911133, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.859626829624176, "num_tokens": 308191990.0, "step": 8080 }, { "epoch": 1.0279862612899122, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.63969421386719, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8377896547317505, "num_tokens": 308231047.0, "step": 8081 }, { "epoch": 1.0281134715685027, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.15010452270508, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.873868465423584, "num_tokens": 308269487.0, "step": 8082 }, { "epoch": 1.0282406818470933, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.6583251953125, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8577793836593628, "num_tokens": 308304974.0, "step": 8083 }, { "epoch": 1.0283678921256838, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.00314712524414, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8802753686904907, "num_tokens": 308342072.0, "step": 8084 }, { "epoch": 1.0284951024042743, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.999576568603516, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8540220260620117, "num_tokens": 308374293.0, "step": 8085 }, { "epoch": 1.0286223126828649, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.7606315612793, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8621426224708557, "num_tokens": 308415872.0, "step": 8086 }, { "epoch": 1.0287495229614554, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.117332458496094, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8523497581481934, "num_tokens": 308451198.0, "step": 8087 }, { "epoch": 1.0288767332400457, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.897891998291016, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8659805059432983, "num_tokens": 308490554.0, "step": 8088 }, { "epoch": 1.0290039435186362, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.08564758300781, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8527546525001526, "num_tokens": 308530221.0, "step": 8089 }, { "epoch": 1.0291311537972267, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.9808235168457, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8429458141326904, "num_tokens": 308563634.0, "step": 8090 }, { "epoch": 1.0292583640758173, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.446311950683594, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8586701154708862, "num_tokens": 308605033.0, "step": 8091 }, { "epoch": 1.0293855743544078, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.22972869873047, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8592591285705566, "num_tokens": 308642603.0, "step": 8092 }, { "epoch": 1.0295127846329983, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.422664642333984, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8544061183929443, "num_tokens": 308685189.0, "step": 8093 }, { "epoch": 1.0296399949115889, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.2089958190918, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8586275577545166, "num_tokens": 308726052.0, "step": 8094 }, { "epoch": 1.0297672051901794, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.91691970825195, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.845491886138916, "num_tokens": 308764828.0, "step": 8095 }, { "epoch": 1.02989441546877, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.05693435668945, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.869989275932312, "num_tokens": 308799712.0, "step": 8096 }, { "epoch": 1.0300216257473604, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.21710205078125, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8589717745780945, "num_tokens": 308839879.0, "step": 8097 }, { "epoch": 1.030148836025951, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.95842742919922, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8507152199745178, "num_tokens": 308880362.0, "step": 8098 }, { "epoch": 1.0302760463045415, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.206974029541016, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8688631057739258, "num_tokens": 308921395.0, "step": 8099 }, { "epoch": 1.0304032565831318, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.89603042602539, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.866700291633606, "num_tokens": 308959976.0, "step": 8100 }, { "epoch": 1.0305304668617223, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.0142822265625, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8723763227462769, "num_tokens": 309000702.0, "step": 8101 }, { "epoch": 1.0306576771403129, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.800846099853516, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8773114085197449, "num_tokens": 309036540.0, "step": 8102 }, { "epoch": 1.0307848874189034, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.75553512573242, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.855937123298645, "num_tokens": 309072408.0, "step": 8103 }, { "epoch": 1.030912097697494, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.11880874633789, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8442538976669312, "num_tokens": 309110366.0, "step": 8104 }, { "epoch": 1.0310393079760845, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.7187614440918, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8611537218093872, "num_tokens": 309155194.0, "step": 8105 }, { "epoch": 1.031166518254675, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 34.77873611450195, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8667923212051392, "num_tokens": 309193747.0, "step": 8106 }, { "epoch": 1.0312937285332655, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 34.86513900756836, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8470214605331421, "num_tokens": 309234161.0, "step": 8107 }, { "epoch": 1.031420938811856, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.566131591796875, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.84773188829422, "num_tokens": 309272574.0, "step": 8108 }, { "epoch": 1.0315481490904466, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 34.808990478515625, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8607497215270996, "num_tokens": 309306567.0, "step": 8109 }, { "epoch": 1.031675359369037, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 34.79938888549805, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8625422120094299, "num_tokens": 309347218.0, "step": 8110 }, { "epoch": 1.0318025696476276, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 34.63253402709961, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8720228672027588, "num_tokens": 309385701.0, "step": 8111 }, { "epoch": 1.0319297799262181, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.02904510498047, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8575483560562134, "num_tokens": 309424600.0, "step": 8112 }, { "epoch": 1.0320569902048085, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.03254699707031, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.837670087814331, "num_tokens": 309459929.0, "step": 8113 }, { "epoch": 1.032184200483399, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.87213134765625, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8505984544754028, "num_tokens": 309505085.0, "step": 8114 }, { "epoch": 1.0323114107619895, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.15474319458008, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8612707853317261, "num_tokens": 309544746.0, "step": 8115 }, { "epoch": 1.03243862104058, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.52787780761719, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8443523645401001, "num_tokens": 309584301.0, "step": 8116 }, { "epoch": 1.0325658313191706, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.1140251159668, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.861546516418457, "num_tokens": 309622570.0, "step": 8117 }, { "epoch": 1.032693041597761, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.68726348876953, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8422219157218933, "num_tokens": 309666417.0, "step": 8118 }, { "epoch": 1.0328202518763516, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.317474365234375, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8480526208877563, "num_tokens": 309705706.0, "step": 8119 }, { "epoch": 1.0329474621549422, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 34.71416091918945, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8675981760025024, "num_tokens": 309744288.0, "step": 8120 }, { "epoch": 1.0330746724335327, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 34.7834587097168, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8454481363296509, "num_tokens": 309780406.0, "step": 8121 }, { "epoch": 1.0332018827121232, "ewc_loss": 0.10693359375, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.24407958984375, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8254455327987671, "num_tokens": 309811135.0, "step": 8122 }, { "epoch": 1.0333290929907137, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.86179733276367, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8617631196975708, "num_tokens": 309847037.0, "step": 8123 }, { "epoch": 1.0334563032693043, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.18180847167969, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8575534224510193, "num_tokens": 309884274.0, "step": 8124 }, { "epoch": 1.0335835135478946, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.01905059814453, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8704888820648193, "num_tokens": 309915572.0, "step": 8125 }, { "epoch": 1.033710723826485, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.034263610839844, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8632355332374573, "num_tokens": 309954502.0, "step": 8126 }, { "epoch": 1.0338379341050756, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.98474884033203, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8809245824813843, "num_tokens": 309993015.0, "step": 8127 }, { "epoch": 1.0339651443836662, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.739768981933594, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8497925400733948, "num_tokens": 310030840.0, "step": 8128 }, { "epoch": 1.0340923546622567, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.1741828918457, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8577572107315063, "num_tokens": 310067586.0, "step": 8129 }, { "epoch": 1.0342195649408472, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.62466049194336, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8681905269622803, "num_tokens": 310104496.0, "step": 8130 }, { "epoch": 1.0343467752194377, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.056671142578125, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8404297828674316, "num_tokens": 310141162.0, "step": 8131 }, { "epoch": 1.0344739854980283, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.99882888793945, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8562873601913452, "num_tokens": 310182965.0, "step": 8132 }, { "epoch": 1.0346011957766188, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.17858123779297, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.840625524520874, "num_tokens": 310219572.0, "step": 8133 }, { "epoch": 1.0347284060552093, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.11099624633789, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.844316303730011, "num_tokens": 310248711.0, "step": 8134 }, { "epoch": 1.0348556163337999, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.7775764465332, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8419042825698853, "num_tokens": 310287915.0, "step": 8135 }, { "epoch": 1.0349828266123904, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.43552017211914, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8565429449081421, "num_tokens": 310325619.0, "step": 8136 }, { "epoch": 1.0351100368909807, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.86929702758789, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.866429328918457, "num_tokens": 310362456.0, "step": 8137 }, { "epoch": 1.0352372471695712, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.300907135009766, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8538122177124023, "num_tokens": 310399584.0, "step": 8138 }, { "epoch": 1.0353644574481617, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.9850959777832, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8506255149841309, "num_tokens": 310437435.0, "step": 8139 }, { "epoch": 1.0354916677267523, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.26889419555664, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8647664785385132, "num_tokens": 310476320.0, "step": 8140 }, { "epoch": 1.0356188780053428, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.037315368652344, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.873913586139679, "num_tokens": 310512956.0, "step": 8141 }, { "epoch": 1.0357460882839333, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.084781646728516, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.855557918548584, "num_tokens": 310547605.0, "step": 8142 }, { "epoch": 1.0358732985625239, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.94538497924805, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.855779767036438, "num_tokens": 310586991.0, "step": 8143 }, { "epoch": 1.0360005088411144, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.13351058959961, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8571529388427734, "num_tokens": 310621271.0, "step": 8144 }, { "epoch": 1.036127719119705, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.904205322265625, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.860176682472229, "num_tokens": 310662154.0, "step": 8145 }, { "epoch": 1.0362549293982954, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.124412536621094, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8683599829673767, "num_tokens": 310701924.0, "step": 8146 }, { "epoch": 1.036382139676886, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.17277908325195, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8644520044326782, "num_tokens": 310739688.0, "step": 8147 }, { "epoch": 1.0365093499554765, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.0872802734375, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8527004718780518, "num_tokens": 310773317.0, "step": 8148 }, { "epoch": 1.0366365602340668, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.01887893676758, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8561541438102722, "num_tokens": 310810854.0, "step": 8149 }, { "epoch": 1.0367637705126573, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.721519470214844, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8505125641822815, "num_tokens": 310846080.0, "step": 8150 }, { "epoch": 1.0368909807912479, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.12283706665039, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8355056047439575, "num_tokens": 310885032.0, "step": 8151 }, { "epoch": 1.0370181910698384, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.82514572143555, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8635732531547546, "num_tokens": 310924518.0, "step": 8152 }, { "epoch": 1.037145401348429, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.15660858154297, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8601011037826538, "num_tokens": 310961734.0, "step": 8153 }, { "epoch": 1.0372726116270194, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.80050277709961, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.8598971366882324, "num_tokens": 311004541.0, "step": 8154 }, { "epoch": 1.03739982190561, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.06870651245117, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8668913245201111, "num_tokens": 311043947.0, "step": 8155 }, { "epoch": 1.0375270321842005, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.1786994934082, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8548332452774048, "num_tokens": 311083154.0, "step": 8156 }, { "epoch": 1.037654242462791, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 34.760032653808594, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8704723715782166, "num_tokens": 311121429.0, "step": 8157 }, { "epoch": 1.0377814527413816, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.43735885620117, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8425159454345703, "num_tokens": 311166578.0, "step": 8158 }, { "epoch": 1.037908663019972, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.170677185058594, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8605703711509705, "num_tokens": 311209645.0, "step": 8159 }, { "epoch": 1.0380358732985626, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.647586822509766, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8612546324729919, "num_tokens": 311248428.0, "step": 8160 }, { "epoch": 1.0381630835771531, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.15107727050781, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8574272394180298, "num_tokens": 311282040.0, "step": 8161 }, { "epoch": 1.0382902938557435, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.290462493896484, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8653079271316528, "num_tokens": 311320233.0, "step": 8162 }, { "epoch": 1.038417504134334, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.84368896484375, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8436678051948547, "num_tokens": 311357586.0, "step": 8163 }, { "epoch": 1.0385447144129245, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.00279235839844, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.857315719127655, "num_tokens": 311395545.0, "step": 8164 }, { "epoch": 1.038671924691515, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.61185836791992, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8577909469604492, "num_tokens": 311428785.0, "step": 8165 }, { "epoch": 1.0387991349701056, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 36.181556701660156, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8468309640884399, "num_tokens": 311469546.0, "step": 8166 }, { "epoch": 1.038926345248696, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.01038360595703, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8498101830482483, "num_tokens": 311504064.0, "step": 8167 }, { "epoch": 1.0390535555272866, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.92814254760742, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8408225774765015, "num_tokens": 311536600.0, "step": 8168 }, { "epoch": 1.0391807658058771, "ewc_loss": 0.10302734375, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.487701416015625e-05, "grad_norm": 35.171905517578125, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8740585446357727, "num_tokens": 311573638.0, "step": 8169 }, { "epoch": 1.0393079760844677, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.408573150634766, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8512498140335083, "num_tokens": 311611085.0, "step": 8170 }, { "epoch": 1.0394351863630582, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.22337341308594, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8515208959579468, "num_tokens": 311655497.0, "step": 8171 }, { "epoch": 1.0395623966416487, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.57429885864258, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8432953357696533, "num_tokens": 311688167.0, "step": 8172 }, { "epoch": 1.0396896069202393, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.350799560546875, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8566268682479858, "num_tokens": 311724926.0, "step": 8173 }, { "epoch": 1.0398168171988296, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.55535125732422, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8394362926483154, "num_tokens": 311760729.0, "step": 8174 }, { "epoch": 1.03994402747742, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.23727798461914, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8588519096374512, "num_tokens": 311802138.0, "step": 8175 }, { "epoch": 1.0400712377560106, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.75872039794922, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8649857044219971, "num_tokens": 311838510.0, "step": 8176 }, { "epoch": 1.0401984480346012, "ewc_loss": 0.103515625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.535385131835938e-05, "grad_norm": 35.33576202392578, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8505045175552368, "num_tokens": 311870875.0, "step": 8177 }, { "epoch": 1.0403256583131917, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.6640625, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8705462217330933, "num_tokens": 311905854.0, "step": 8178 }, { "epoch": 1.0404528685917822, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.322509765625, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8543235063552856, "num_tokens": 311936381.0, "step": 8179 }, { "epoch": 1.0405800788703727, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.43638229370117, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8517471551895142, "num_tokens": 311972529.0, "step": 8180 }, { "epoch": 1.0407072891489633, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.23484420776367, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8774821162223816, "num_tokens": 312007883.0, "step": 8181 }, { "epoch": 1.0408344994275538, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.0554313659668, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8537176847457886, "num_tokens": 312041000.0, "step": 8182 }, { "epoch": 1.0409617097061443, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.38666915893555, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.860938310623169, "num_tokens": 312076011.0, "step": 8183 }, { "epoch": 1.0410889199847349, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.982269287109375, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8519818782806396, "num_tokens": 312115614.0, "step": 8184 }, { "epoch": 1.0412161302633254, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.42524337768555, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.850588858127594, "num_tokens": 312154111.0, "step": 8185 }, { "epoch": 1.0413433405419157, "ewc_loss": 0.10400390625, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 34.82859802246094, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8512415885925293, "num_tokens": 312191571.0, "step": 8186 }, { "epoch": 1.0414705508205062, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.20840072631836, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8511505722999573, "num_tokens": 312229205.0, "step": 8187 }, { "epoch": 1.0415977610990967, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.48698425292969, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8736550807952881, "num_tokens": 312266604.0, "step": 8188 }, { "epoch": 1.0417249713776873, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.80061721801758, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.846222996711731, "num_tokens": 312305222.0, "step": 8189 }, { "epoch": 1.0418521816562778, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.89783477783203, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8348273038864136, "num_tokens": 312345823.0, "step": 8190 }, { "epoch": 1.0419793919348683, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.01607131958008, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8469946980476379, "num_tokens": 312383569.0, "step": 8191 }, { "epoch": 1.0421066022134589, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.21455764770508, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8607256412506104, "num_tokens": 312423068.0, "step": 8192 }, { "epoch": 1.0422338124920494, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.85557556152344, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8522636890411377, "num_tokens": 312460569.0, "step": 8193 }, { "epoch": 1.04236102277064, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.14345932006836, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8365539908409119, "num_tokens": 312501654.0, "step": 8194 }, { "epoch": 1.0424882330492304, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.013397216796875, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8495458960533142, "num_tokens": 312543630.0, "step": 8195 }, { "epoch": 1.042615443327821, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.467926025390625, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8558579683303833, "num_tokens": 312579428.0, "step": 8196 }, { "epoch": 1.0427426536064115, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.94865036010742, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8711504340171814, "num_tokens": 312621388.0, "step": 8197 }, { "epoch": 1.0428698638850018, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.49803924560547, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8585779070854187, "num_tokens": 312665303.0, "step": 8198 }, { "epoch": 1.0429970741635923, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.55979537963867, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8472902774810791, "num_tokens": 312702354.0, "step": 8199 }, { "epoch": 1.0431242844421829, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.06717300415039, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8628942966461182, "num_tokens": 312737223.0, "step": 8200 }, { "epoch": 1.0432514947207734, "ewc_loss": 0.10498046875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.583740234375, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8592507243156433, "num_tokens": 312773353.0, "step": 8201 }, { "epoch": 1.043378704999364, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.908851623535156, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8384883403778076, "num_tokens": 312809230.0, "step": 8202 }, { "epoch": 1.0435059152779544, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.560630798339844, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8749388456344604, "num_tokens": 312841278.0, "step": 8203 }, { "epoch": 1.043633125556545, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.289451599121094, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8625478148460388, "num_tokens": 312888469.0, "step": 8204 }, { "epoch": 1.0437603358351355, "ewc_loss": 0.10693359375, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.86404037475586, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8661645650863647, "num_tokens": 312925446.0, "step": 8205 }, { "epoch": 1.043887546113726, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.011234283447266, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8556538820266724, "num_tokens": 312965425.0, "step": 8206 }, { "epoch": 1.0440147563923166, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.84691619873047, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8503534197807312, "num_tokens": 313000637.0, "step": 8207 }, { "epoch": 1.044141966670907, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 34.863616943359375, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8568885326385498, "num_tokens": 313035986.0, "step": 8208 }, { "epoch": 1.0442691769494976, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.540042877197266, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8462783098220825, "num_tokens": 313077027.0, "step": 8209 }, { "epoch": 1.0443963872280881, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.58306884765625e-05, "grad_norm": 35.24888610839844, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.866634726524353, "num_tokens": 313121733.0, "step": 8210 }, { "epoch": 1.0445235975066784, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.310855865478516, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8505464196205139, "num_tokens": 313165542.0, "step": 8211 }, { "epoch": 1.044650807785269, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.363563537597656, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8464117050170898, "num_tokens": 313201082.0, "step": 8212 }, { "epoch": 1.0447780180638595, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.536415100097656, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.868021547794342, "num_tokens": 313241765.0, "step": 8213 }, { "epoch": 1.04490522834245, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 34.995811462402344, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8545376062393188, "num_tokens": 313282471.0, "step": 8214 }, { "epoch": 1.0450324386210406, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.32197570800781, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8530948162078857, "num_tokens": 313331505.0, "step": 8215 }, { "epoch": 1.045159648899631, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.300899505615234, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8577471375465393, "num_tokens": 313369823.0, "step": 8216 }, { "epoch": 1.0452868591782216, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.15926742553711, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.858036458492279, "num_tokens": 313404255.0, "step": 8217 }, { "epoch": 1.0454140694568121, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.278663635253906, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.868729293346405, "num_tokens": 313439368.0, "step": 8218 }, { "epoch": 1.0455412797354027, "ewc_loss": 0.10693359375, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.414146423339844, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8481636047363281, "num_tokens": 313479475.0, "step": 8219 }, { "epoch": 1.0456684900139932, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.85835266113281, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8487733602523804, "num_tokens": 313514604.0, "step": 8220 }, { "epoch": 1.0457957002925837, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.090736389160156, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8541150093078613, "num_tokens": 313548907.0, "step": 8221 }, { "epoch": 1.0459229105711743, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.21013641357422, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8562382459640503, "num_tokens": 313586615.0, "step": 8222 }, { "epoch": 1.0460501208497646, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.50367736816406, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8467264175415039, "num_tokens": 313625202.0, "step": 8223 }, { "epoch": 1.046177331128355, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 34.902137756347656, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8502235412597656, "num_tokens": 313663257.0, "step": 8224 }, { "epoch": 1.0463045414069456, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.611053466796875, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8614323139190674, "num_tokens": 313701430.0, "step": 8225 }, { "epoch": 1.0464317516855361, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.125343322753906, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8670676946640015, "num_tokens": 313741286.0, "step": 8226 }, { "epoch": 1.0465589619641267, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.506290435791016, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8421935439109802, "num_tokens": 313779115.0, "step": 8227 }, { "epoch": 1.0466861722427172, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.55833435058594, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8628225326538086, "num_tokens": 313822331.0, "step": 8228 }, { "epoch": 1.0468133825213077, "ewc_loss": 0.10693359375, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.078529357910156, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8511359095573425, "num_tokens": 313863545.0, "step": 8229 }, { "epoch": 1.0469405927998983, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.432167053222656, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8781148195266724, "num_tokens": 313894956.0, "step": 8230 }, { "epoch": 1.0470678030784888, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.44476318359375, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8659934401512146, "num_tokens": 313927920.0, "step": 8231 }, { "epoch": 1.0471950133570793, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.049034118652344, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8655335903167725, "num_tokens": 313963230.0, "step": 8232 }, { "epoch": 1.0473222236356698, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.505149841308594, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8481348156929016, "num_tokens": 313995951.0, "step": 8233 }, { "epoch": 1.0474494339142604, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.15925979614258, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8578660488128662, "num_tokens": 314040928.0, "step": 8234 }, { "epoch": 1.0475766441928507, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.18239974975586, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8492980003356934, "num_tokens": 314080329.0, "step": 8235 }, { "epoch": 1.0477038544714412, "ewc_loss": 0.10595703125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.34136199951172, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.857792317867279, "num_tokens": 314117107.0, "step": 8236 }, { "epoch": 1.0478310647500317, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.451332092285156, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8540357351303101, "num_tokens": 314148077.0, "step": 8237 }, { "epoch": 1.0479582750286223, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.09046173095703, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8492720127105713, "num_tokens": 314182064.0, "step": 8238 }, { "epoch": 1.0480854853072128, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.82737731933594, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8473413586616516, "num_tokens": 314221667.0, "step": 8239 }, { "epoch": 1.0482126955858033, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.080081939697266, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.853705883026123, "num_tokens": 314255225.0, "step": 8240 }, { "epoch": 1.0483399058643939, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.78707504272461, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8586148023605347, "num_tokens": 314288011.0, "step": 8241 }, { "epoch": 1.0484671161429844, "ewc_loss": 0.1044921875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.630752563476562e-05, "grad_norm": 35.19024658203125, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8545718193054199, "num_tokens": 314332461.0, "step": 8242 }, { "epoch": 1.048594326421575, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.300880432128906, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8522642850875854, "num_tokens": 314367967.0, "step": 8243 }, { "epoch": 1.0487215367001654, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 34.99831008911133, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8661596179008484, "num_tokens": 314404487.0, "step": 8244 }, { "epoch": 1.048848746978756, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.543609619140625, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8554092645645142, "num_tokens": 314444731.0, "step": 8245 }, { "epoch": 1.0489759572573465, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.405513763427734, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8566312789916992, "num_tokens": 314479720.0, "step": 8246 }, { "epoch": 1.0491031675359368, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.61930465698242, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.874280571937561, "num_tokens": 314515613.0, "step": 8247 }, { "epoch": 1.0492303778145273, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.188819885253906, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.855110764503479, "num_tokens": 314556519.0, "step": 8248 }, { "epoch": 1.0493575880931179, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.795013427734375, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8684787750244141, "num_tokens": 314590803.0, "step": 8249 }, { "epoch": 1.0494847983717084, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.29417037963867, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8588712215423584, "num_tokens": 314631347.0, "step": 8250 }, { "epoch": 1.049612008650299, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.61906814575195, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8554158806800842, "num_tokens": 314673739.0, "step": 8251 }, { "epoch": 1.0497392189288894, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.233882904052734, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8590958714485168, "num_tokens": 314718498.0, "step": 8252 }, { "epoch": 1.04986642920748, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.427711486816406, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8627901077270508, "num_tokens": 314758755.0, "step": 8253 }, { "epoch": 1.0499936394860705, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.377437591552734, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8612471222877502, "num_tokens": 314793644.0, "step": 8254 }, { "epoch": 1.050120849764661, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.51760482788086, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8538273572921753, "num_tokens": 314828045.0, "step": 8255 }, { "epoch": 1.0502480600432516, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.31351089477539, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8598924279212952, "num_tokens": 314866088.0, "step": 8256 }, { "epoch": 1.050375270321842, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.954933166503906, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.848834216594696, "num_tokens": 314904757.0, "step": 8257 }, { "epoch": 1.0505024806004326, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.00898361206055, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8489001989364624, "num_tokens": 314941410.0, "step": 8258 }, { "epoch": 1.0506296908790231, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.702999114990234, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8654926419258118, "num_tokens": 314982060.0, "step": 8259 }, { "epoch": 1.0507569011576134, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.09861373901367, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8483926653862, "num_tokens": 315021812.0, "step": 8260 }, { "epoch": 1.050884111436204, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.487003326416016, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8532835245132446, "num_tokens": 315059520.0, "step": 8261 }, { "epoch": 1.0510113217147945, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.1015625, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8518283367156982, "num_tokens": 315098392.0, "step": 8262 }, { "epoch": 1.051138531993385, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.493003845214844, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8460217118263245, "num_tokens": 315137375.0, "step": 8263 }, { "epoch": 1.0512657422719756, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.10911178588867, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8609660267829895, "num_tokens": 315171308.0, "step": 8264 }, { "epoch": 1.051392952550566, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.41664123535156, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8588753938674927, "num_tokens": 315212176.0, "step": 8265 }, { "epoch": 1.0515201628291566, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.49440002441406, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8452599048614502, "num_tokens": 315251321.0, "step": 8266 }, { "epoch": 1.0516473731077471, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.2049674987793, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8421446084976196, "num_tokens": 315288780.0, "step": 8267 }, { "epoch": 1.0517745833863377, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.58802032470703, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8809803128242493, "num_tokens": 315325630.0, "step": 8268 }, { "epoch": 1.0519017936649282, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.284271240234375, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8520762920379639, "num_tokens": 315362887.0, "step": 8269 }, { "epoch": 1.0520290039435187, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.863346099853516, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8694157004356384, "num_tokens": 315397552.0, "step": 8270 }, { "epoch": 1.0521562142221093, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.08504867553711, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8668106198310852, "num_tokens": 315436785.0, "step": 8271 }, { "epoch": 1.0522834245006996, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.78498458862305, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8669837713241577, "num_tokens": 315467849.0, "step": 8272 }, { "epoch": 1.05241063477929, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.23187255859375, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8658090233802795, "num_tokens": 315501496.0, "step": 8273 }, { "epoch": 1.0525378450578806, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.78926467895508, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8663077354431152, "num_tokens": 315540878.0, "step": 8274 }, { "epoch": 1.0526650553364711, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.268306732177734, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8713433146476746, "num_tokens": 315578139.0, "step": 8275 }, { "epoch": 1.0527922656150617, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.73875427246094, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8596779704093933, "num_tokens": 315615845.0, "step": 8276 }, { "epoch": 1.0529194758936522, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.43992614746094, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8477545976638794, "num_tokens": 315645407.0, "step": 8277 }, { "epoch": 1.0530466861722427, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.47795486450195, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8640249371528625, "num_tokens": 315687922.0, "step": 8278 }, { "epoch": 1.0531738964508333, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.03315353393555, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.859890341758728, "num_tokens": 315723730.0, "step": 8279 }, { "epoch": 1.0533011067294238, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.70703887939453, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8687862753868103, "num_tokens": 315765662.0, "step": 8280 }, { "epoch": 1.0534283170080143, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.50870132446289, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8329373598098755, "num_tokens": 315808668.0, "step": 8281 }, { "epoch": 1.0535555272866048, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.309329986572266, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8541555404663086, "num_tokens": 315844878.0, "step": 8282 }, { "epoch": 1.0536827375651954, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.55117416381836, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8713251948356628, "num_tokens": 315879472.0, "step": 8283 }, { "epoch": 1.0538099478437857, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.19865036010742, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8509323596954346, "num_tokens": 315915496.0, "step": 8284 }, { "epoch": 1.0539371581223762, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.57621383666992, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8602328300476074, "num_tokens": 315954317.0, "step": 8285 }, { "epoch": 1.0540643684009667, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.5593147277832, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8518242835998535, "num_tokens": 315995518.0, "step": 8286 }, { "epoch": 1.0541915786795573, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.20131301879883, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8709739446640015, "num_tokens": 316040475.0, "step": 8287 }, { "epoch": 1.0543187889581478, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.49468231201172, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8564702868461609, "num_tokens": 316083003.0, "step": 8288 }, { "epoch": 1.0544459992367383, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.678436279296875e-05, "grad_norm": 35.28603744506836, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8586795330047607, "num_tokens": 316124687.0, "step": 8289 }, { "epoch": 1.0545732095153288, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.63724899291992, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8434115052223206, "num_tokens": 316170052.0, "step": 8290 }, { "epoch": 1.0547004197939194, "ewc_loss": 0.10546875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.726119995117188e-05, "grad_norm": 35.38883590698242, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8426694869995117, "num_tokens": 316211941.0, "step": 8291 }, { "epoch": 1.05482763007251, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.37568664550781, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8611142039299011, "num_tokens": 316252592.0, "step": 8292 }, { "epoch": 1.0549548403511004, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.53870391845703, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8650882840156555, "num_tokens": 316287250.0, "step": 8293 }, { "epoch": 1.055082050629691, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.4494743347168, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8609862327575684, "num_tokens": 316326308.0, "step": 8294 }, { "epoch": 1.0552092609082815, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.57142639160156, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.86058109998703, "num_tokens": 316364519.0, "step": 8295 }, { "epoch": 1.0553364711868718, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.15131759643555, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8633551597595215, "num_tokens": 316401165.0, "step": 8296 }, { "epoch": 1.0554636814654623, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.67472457885742, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8516959547996521, "num_tokens": 316440245.0, "step": 8297 }, { "epoch": 1.0555908917440529, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.313255310058594, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8662679195404053, "num_tokens": 316484969.0, "step": 8298 }, { "epoch": 1.0557181020226434, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.261260986328125, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8764976263046265, "num_tokens": 316520510.0, "step": 8299 }, { "epoch": 1.055845312301234, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.734764099121094, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8415437936782837, "num_tokens": 316561946.0, "step": 8300 }, { "epoch": 1.0559725225798244, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 34.815982818603516, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8753883242607117, "num_tokens": 316605489.0, "step": 8301 }, { "epoch": 1.056099732858415, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.89215850830078, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8416762948036194, "num_tokens": 316648590.0, "step": 8302 }, { "epoch": 1.0562269431370055, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.08787155151367, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8710788488388062, "num_tokens": 316688169.0, "step": 8303 }, { "epoch": 1.056354153415596, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.80887985229492, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8351608514785767, "num_tokens": 316724031.0, "step": 8304 }, { "epoch": 1.0564813636941865, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.20819091796875, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8573283553123474, "num_tokens": 316769762.0, "step": 8305 }, { "epoch": 1.056608573972777, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.57344436645508, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8662993311882019, "num_tokens": 316814156.0, "step": 8306 }, { "epoch": 1.0567357842513676, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.3769416809082, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.868014931678772, "num_tokens": 316852784.0, "step": 8307 }, { "epoch": 1.0568629945299581, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.431583404541016, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8513444662094116, "num_tokens": 316891992.0, "step": 8308 }, { "epoch": 1.0569902048085484, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.28348159790039, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8719415068626404, "num_tokens": 316932246.0, "step": 8309 }, { "epoch": 1.057117415087139, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.1854362487793, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8501657247543335, "num_tokens": 316972017.0, "step": 8310 }, { "epoch": 1.0572446253657295, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.6539306640625, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8521497249603271, "num_tokens": 317008631.0, "step": 8311 }, { "epoch": 1.05737183564432, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.36875915527344, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8627989888191223, "num_tokens": 317048616.0, "step": 8312 }, { "epoch": 1.0574990459229106, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.36405563354492, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8495641350746155, "num_tokens": 317088675.0, "step": 8313 }, { "epoch": 1.057626256201501, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.3570442199707, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8793321251869202, "num_tokens": 317120351.0, "step": 8314 }, { "epoch": 1.0577534664800916, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.16317367553711, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8510411977767944, "num_tokens": 317159027.0, "step": 8315 }, { "epoch": 1.0578806767586821, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.870025634765625, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8569926619529724, "num_tokens": 317188892.0, "step": 8316 }, { "epoch": 1.0580078870372727, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.18126678466797, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8632228374481201, "num_tokens": 317229026.0, "step": 8317 }, { "epoch": 1.0581350973158632, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.627052307128906, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8574225306510925, "num_tokens": 317272946.0, "step": 8318 }, { "epoch": 1.0582623075944537, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.2463264465332, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8565647602081299, "num_tokens": 317314625.0, "step": 8319 }, { "epoch": 1.058389517873044, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.62876892089844, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8680263757705688, "num_tokens": 317351450.0, "step": 8320 }, { "epoch": 1.0585167281516346, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.84142303466797, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.840997040271759, "num_tokens": 317391519.0, "step": 8321 }, { "epoch": 1.058643938430225, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.65442657470703, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8512256145477295, "num_tokens": 317430672.0, "step": 8322 }, { "epoch": 1.0587711487088156, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.37047576904297, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8583202362060547, "num_tokens": 317470203.0, "step": 8323 }, { "epoch": 1.0588983589874061, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.564537048339844, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.860088586807251, "num_tokens": 317509594.0, "step": 8324 }, { "epoch": 1.0590255692659967, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.41130828857422, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8557937145233154, "num_tokens": 317549735.0, "step": 8325 }, { "epoch": 1.0591527795445872, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.492271423339844, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8554465174674988, "num_tokens": 317585064.0, "step": 8326 }, { "epoch": 1.0592799898231777, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.4905891418457, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8732473850250244, "num_tokens": 317622289.0, "step": 8327 }, { "epoch": 1.0594072001017683, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.59331512451172, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8465043306350708, "num_tokens": 317662872.0, "step": 8328 }, { "epoch": 1.0595344103803588, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.64470672607422, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8585572838783264, "num_tokens": 317701418.0, "step": 8329 }, { "epoch": 1.0596616206589493, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.422794342041016, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.871452808380127, "num_tokens": 317741416.0, "step": 8330 }, { "epoch": 1.0597888309375398, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.58756637573242, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8624827265739441, "num_tokens": 317778623.0, "step": 8331 }, { "epoch": 1.0599160412161304, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.65420913696289, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.859216034412384, "num_tokens": 317816577.0, "step": 8332 }, { "epoch": 1.0600432514947207, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.47855758666992, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8567289113998413, "num_tokens": 317851532.0, "step": 8333 }, { "epoch": 1.0601704617733112, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.59238052368164, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8582574129104614, "num_tokens": 317886789.0, "step": 8334 }, { "epoch": 1.0602976720519017, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.462345123291016, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8610590100288391, "num_tokens": 317929511.0, "step": 8335 }, { "epoch": 1.0604248823304923, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.32571029663086, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8542936444282532, "num_tokens": 317963767.0, "step": 8336 }, { "epoch": 1.0605520926090828, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.13017654418945, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8738120198249817, "num_tokens": 318000458.0, "step": 8337 }, { "epoch": 1.0606793028876733, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.59724044799805, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8745949268341064, "num_tokens": 318040394.0, "step": 8338 }, { "epoch": 1.0608065131662638, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.36662292480469, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8545337319374084, "num_tokens": 318080859.0, "step": 8339 }, { "epoch": 1.0609337234448544, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.56052017211914, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8754482865333557, "num_tokens": 318115378.0, "step": 8340 }, { "epoch": 1.061060933723445, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.50800323486328, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8503365516662598, "num_tokens": 318150351.0, "step": 8341 }, { "epoch": 1.0611881440020354, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.374794006347656, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8580266237258911, "num_tokens": 318192427.0, "step": 8342 }, { "epoch": 1.061315354280626, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.76518630981445, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8460888862609863, "num_tokens": 318232150.0, "step": 8343 }, { "epoch": 1.0614425645592165, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.33894348144531, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8541883230209351, "num_tokens": 318267742.0, "step": 8344 }, { "epoch": 1.0615697748378068, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.61384963989258, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8621945381164551, "num_tokens": 318307974.0, "step": 8345 }, { "epoch": 1.0616969851163973, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.30513000488281, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8402748703956604, "num_tokens": 318349880.0, "step": 8346 }, { "epoch": 1.0618241953949878, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.81113815307617, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8551374077796936, "num_tokens": 318383869.0, "step": 8347 }, { "epoch": 1.0619514056735784, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.32883071899414, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8729790449142456, "num_tokens": 318419659.0, "step": 8348 }, { "epoch": 1.062078615952169, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.453025817871094, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.866217851638794, "num_tokens": 318460795.0, "step": 8349 }, { "epoch": 1.0622058262307594, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.58402633666992, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8487867116928101, "num_tokens": 318499451.0, "step": 8350 }, { "epoch": 1.06233303650935, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.223182678222656, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.863954484462738, "num_tokens": 318533413.0, "step": 8351 }, { "epoch": 1.0624602467879405, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.77326965332031, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.848060667514801, "num_tokens": 318572533.0, "step": 8352 }, { "epoch": 1.062587457066531, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.09605026245117, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.860451340675354, "num_tokens": 318609442.0, "step": 8353 }, { "epoch": 1.0627146673451215, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.612640380859375, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8729556202888489, "num_tokens": 318643475.0, "step": 8354 }, { "epoch": 1.062841877623712, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.304779052734375, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8617363572120667, "num_tokens": 318684223.0, "step": 8355 }, { "epoch": 1.0629690879023026, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.419944763183594, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8586620092391968, "num_tokens": 318717148.0, "step": 8356 }, { "epoch": 1.0630962981808931, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.543548583984375, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8706111907958984, "num_tokens": 318758929.0, "step": 8357 }, { "epoch": 1.0632235084594834, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.32659149169922, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8548686504364014, "num_tokens": 318796438.0, "step": 8358 }, { "epoch": 1.063350718738074, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.465370178222656, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8447978496551514, "num_tokens": 318835732.0, "step": 8359 }, { "epoch": 1.0634779290166645, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.536773681640625, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8565282821655273, "num_tokens": 318872151.0, "step": 8360 }, { "epoch": 1.063605139295255, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.329505920410156, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8470901250839233, "num_tokens": 318911956.0, "step": 8361 }, { "epoch": 1.0637323495738455, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.572906494140625, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8595633506774902, "num_tokens": 318945593.0, "step": 8362 }, { "epoch": 1.063859559852436, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.58736038208008, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.868441641330719, "num_tokens": 318976670.0, "step": 8363 }, { "epoch": 1.0639867701310266, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.70016860961914, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8621413707733154, "num_tokens": 319007798.0, "step": 8364 }, { "epoch": 1.0641139804096171, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.328773498535156, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8479577302932739, "num_tokens": 319044808.0, "step": 8365 }, { "epoch": 1.0642411906882077, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.987972259521484, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8792605996131897, "num_tokens": 319083387.0, "step": 8366 }, { "epoch": 1.0643684009667982, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.15342712402344, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8749439120292664, "num_tokens": 319121897.0, "step": 8367 }, { "epoch": 1.0644956112453887, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.66465759277344, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8457484841346741, "num_tokens": 319157190.0, "step": 8368 }, { "epoch": 1.064622821523979, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.55330276489258, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8681027889251709, "num_tokens": 319188353.0, "step": 8369 }, { "epoch": 1.0647500318025696, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.58124923706055, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8619125485420227, "num_tokens": 319225536.0, "step": 8370 }, { "epoch": 1.06487724208116, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.742698669433594, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8644806146621704, "num_tokens": 319265114.0, "step": 8371 }, { "epoch": 1.0650044523597506, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.43846130371094, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8689946532249451, "num_tokens": 319309280.0, "step": 8372 }, { "epoch": 1.0651316626383411, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.83402633666992, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8693059682846069, "num_tokens": 319346114.0, "step": 8373 }, { "epoch": 1.0652588729169317, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.56083297729492, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8667908310890198, "num_tokens": 319389001.0, "step": 8374 }, { "epoch": 1.0653860831955222, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.58174514770508, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8666008710861206, "num_tokens": 319431464.0, "step": 8375 }, { "epoch": 1.0655132934741127, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.68086242675781, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8340572714805603, "num_tokens": 319475638.0, "step": 8376 }, { "epoch": 1.0656405037527032, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.5839729309082, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8697113394737244, "num_tokens": 319515983.0, "step": 8377 }, { "epoch": 1.0657677140312938, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.62349319458008, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8545486330986023, "num_tokens": 319552152.0, "step": 8378 }, { "epoch": 1.0658949243098843, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.71061706542969, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8510062098503113, "num_tokens": 319592963.0, "step": 8379 }, { "epoch": 1.0660221345884748, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.57966613769531, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8448773622512817, "num_tokens": 319635136.0, "step": 8380 }, { "epoch": 1.0661493448670654, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.46841049194336, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8620303869247437, "num_tokens": 319670118.0, "step": 8381 }, { "epoch": 1.0662765551456557, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.490997314453125, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8646798729896545, "num_tokens": 319711057.0, "step": 8382 }, { "epoch": 1.0664037654242462, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.61766815185547, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8564350008964539, "num_tokens": 319746451.0, "step": 8383 }, { "epoch": 1.0665309757028367, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.87400436401367, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8519624471664429, "num_tokens": 319792149.0, "step": 8384 }, { "epoch": 1.0666581859814273, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.98957443237305, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8516063690185547, "num_tokens": 319826508.0, "step": 8385 }, { "epoch": 1.0667853962600178, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.62581253051758, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8616064190864563, "num_tokens": 319867000.0, "step": 8386 }, { "epoch": 1.0669126065386083, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.63148498535156, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8412731289863586, "num_tokens": 319904476.0, "step": 8387 }, { "epoch": 1.0670398168171988, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.5244255065918, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8344349265098572, "num_tokens": 319943279.0, "step": 8388 }, { "epoch": 1.0671670270957894, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.87751388549805, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8542349338531494, "num_tokens": 319986610.0, "step": 8389 }, { "epoch": 1.06729423737438, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.44269943237305, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8516504764556885, "num_tokens": 320022702.0, "step": 8390 }, { "epoch": 1.0674214476529704, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.53475570678711, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8607449531555176, "num_tokens": 320067918.0, "step": 8391 }, { "epoch": 1.067548657931561, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.428924560546875, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.851676344871521, "num_tokens": 320110455.0, "step": 8392 }, { "epoch": 1.0676758682101515, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.837913513183594, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8754934072494507, "num_tokens": 320147675.0, "step": 8393 }, { "epoch": 1.0678030784887418, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 36.370628356933594, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8441328406333923, "num_tokens": 320183882.0, "step": 8394 }, { "epoch": 1.0679302887673323, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 37.184513092041016, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8479403853416443, "num_tokens": 320222215.0, "step": 8395 }, { "epoch": 1.0680574990459228, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 36.46287536621094, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8503068089485168, "num_tokens": 320262418.0, "step": 8396 }, { "epoch": 1.0681847093245134, "ewc_loss": 0.1064453125, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.7738037109375e-05, "grad_norm": 35.844356536865234, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8519386053085327, "num_tokens": 320297304.0, "step": 8397 }, { "epoch": 1.068311919603104, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.3409538269043, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8604416847229004, "num_tokens": 320330919.0, "step": 8398 }, { "epoch": 1.0684391298816944, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 36.297393798828125, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8588758707046509, "num_tokens": 320367602.0, "step": 8399 }, { "epoch": 1.068566340160285, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.37137222290039, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8584104180335999, "num_tokens": 320403875.0, "step": 8400 }, { "epoch": 1.0686935504388755, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.923580169677734, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8616935014724731, "num_tokens": 320439149.0, "step": 8401 }, { "epoch": 1.068820760717466, "ewc_loss": 0.10693359375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.84900665283203, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8704801797866821, "num_tokens": 320476786.0, "step": 8402 }, { "epoch": 1.0689479709960565, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.556522369384766, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.843940019607544, "num_tokens": 320516945.0, "step": 8403 }, { "epoch": 1.069075181274647, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.662052154541016, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8731106519699097, "num_tokens": 320552801.0, "step": 8404 }, { "epoch": 1.0692023915532376, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.24776077270508, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8605671525001526, "num_tokens": 320593605.0, "step": 8405 }, { "epoch": 1.0693296018318281, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.820648193359375, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8667533993721008, "num_tokens": 320624476.0, "step": 8406 }, { "epoch": 1.0694568121104184, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.319149017333984, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8643798828125, "num_tokens": 320667000.0, "step": 8407 }, { "epoch": 1.069584022389009, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.77111053466797, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.864047646522522, "num_tokens": 320699286.0, "step": 8408 }, { "epoch": 1.0697112326675995, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.68741989135742, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8648461103439331, "num_tokens": 320732520.0, "step": 8409 }, { "epoch": 1.06983844294619, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.23582077026367, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8463480472564697, "num_tokens": 320773425.0, "step": 8410 }, { "epoch": 1.0699656532247805, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.609619140625, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8361943364143372, "num_tokens": 320810859.0, "step": 8411 }, { "epoch": 1.070092863503371, "ewc_loss": 0.10693359375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.821487426757812e-05, "grad_norm": 35.554935455322266, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8619971871376038, "num_tokens": 320850266.0, "step": 8412 }, { "epoch": 1.0702200737819616, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.314537048339844, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8575692176818848, "num_tokens": 320892812.0, "step": 8413 }, { "epoch": 1.0703472840605521, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.530067443847656, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8704972863197327, "num_tokens": 320929563.0, "step": 8414 }, { "epoch": 1.0704744943391427, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.49455261230469, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.865850567817688, "num_tokens": 320964010.0, "step": 8415 }, { "epoch": 1.0706017046177332, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.66253662109375, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8558542728424072, "num_tokens": 321006405.0, "step": 8416 }, { "epoch": 1.0707289148963237, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.48470687866211, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8526997566223145, "num_tokens": 321050146.0, "step": 8417 }, { "epoch": 1.070856125174914, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.93620681762695, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.857619047164917, "num_tokens": 321086857.0, "step": 8418 }, { "epoch": 1.0709833354535045, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.84288024902344, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8634858131408691, "num_tokens": 321127310.0, "step": 8419 }, { "epoch": 1.071110545732095, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.193172454833984, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8484387397766113, "num_tokens": 321165881.0, "step": 8420 }, { "epoch": 1.0712377560106856, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.75482940673828, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8511563539505005, "num_tokens": 321204494.0, "step": 8421 }, { "epoch": 1.0713649662892761, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.35099411010742, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8475747108459473, "num_tokens": 321243639.0, "step": 8422 }, { "epoch": 1.0714921765678667, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.681640625, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8487579822540283, "num_tokens": 321281399.0, "step": 8423 }, { "epoch": 1.0716193868464572, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.304901123046875, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8438947200775146, "num_tokens": 321316539.0, "step": 8424 }, { "epoch": 1.0717465971250477, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.79511260986328, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8546737432479858, "num_tokens": 321348618.0, "step": 8425 }, { "epoch": 1.0718738074036382, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.58423614501953, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8632675409317017, "num_tokens": 321381674.0, "step": 8426 }, { "epoch": 1.0720010176822288, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.36785125732422, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8731550574302673, "num_tokens": 321420342.0, "step": 8427 }, { "epoch": 1.0721282279608193, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.95232391357422, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8538256883621216, "num_tokens": 321450770.0, "step": 8428 }, { "epoch": 1.0722554382394098, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.10696029663086, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8613862991333008, "num_tokens": 321491450.0, "step": 8429 }, { "epoch": 1.0723826485180004, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.975311279296875, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8521926999092102, "num_tokens": 321531774.0, "step": 8430 }, { "epoch": 1.0725098587965907, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.1252326965332, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.857697606086731, "num_tokens": 321567692.0, "step": 8431 }, { "epoch": 1.0726370690751812, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.76105499267578, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8535143733024597, "num_tokens": 321601168.0, "step": 8432 }, { "epoch": 1.0727642793537717, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.59803771972656, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8412753343582153, "num_tokens": 321636100.0, "step": 8433 }, { "epoch": 1.0728914896323622, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.90447998046875, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8582923412322998, "num_tokens": 321674365.0, "step": 8434 }, { "epoch": 1.0730186999109528, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.334957122802734, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8689202666282654, "num_tokens": 321712267.0, "step": 8435 }, { "epoch": 1.0731459101895433, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.851619720458984, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8558982014656067, "num_tokens": 321749955.0, "step": 8436 }, { "epoch": 1.0732731204681338, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.544979095458984, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8658018112182617, "num_tokens": 321788522.0, "step": 8437 }, { "epoch": 1.0734003307467244, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.85037612915039, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8558889031410217, "num_tokens": 321826314.0, "step": 8438 }, { "epoch": 1.073527541025315, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.45401382446289, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8549474477767944, "num_tokens": 321864851.0, "step": 8439 }, { "epoch": 1.0736547513039054, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.7056884765625, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8670918345451355, "num_tokens": 321901001.0, "step": 8440 }, { "epoch": 1.073781961582496, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.843223571777344, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8558322191238403, "num_tokens": 321941368.0, "step": 8441 }, { "epoch": 1.0739091718610865, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.57574462890625, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8606318235397339, "num_tokens": 321978268.0, "step": 8442 }, { "epoch": 1.0740363821396768, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 36.018951416015625, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8655895590782166, "num_tokens": 322016969.0, "step": 8443 }, { "epoch": 1.0741635924182673, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.105838775634766, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8509823083877563, "num_tokens": 322059362.0, "step": 8444 }, { "epoch": 1.0742908026968578, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 36.029170989990234, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8381966352462769, "num_tokens": 322097334.0, "step": 8445 }, { "epoch": 1.0744180129754484, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.21164321899414, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8786765336990356, "num_tokens": 322121988.0, "step": 8446 }, { "epoch": 1.074545223254039, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 36.64258575439453, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8528838157653809, "num_tokens": 322156602.0, "step": 8447 }, { "epoch": 1.0746724335326294, "ewc_loss": 0.107421875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 8.869171142578125e-05, "grad_norm": 35.298404693603516, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8544901609420776, "num_tokens": 322194475.0, "step": 8448 }, { "epoch": 1.07479964381122, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.31575393676758, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8651922345161438, "num_tokens": 322232158.0, "step": 8449 }, { "epoch": 1.0749268540898105, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.65011978149414, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8661810159683228, "num_tokens": 322272170.0, "step": 8450 }, { "epoch": 1.075054064368401, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.81201934814453, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8633889555931091, "num_tokens": 322305496.0, "step": 8451 }, { "epoch": 1.0751812746469915, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 36.01421356201172, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8627297878265381, "num_tokens": 322347552.0, "step": 8452 }, { "epoch": 1.075308484925582, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.6732177734375, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8590580224990845, "num_tokens": 322386471.0, "step": 8453 }, { "epoch": 1.0754356952041726, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 36.12276077270508, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8225349187850952, "num_tokens": 322431019.0, "step": 8454 }, { "epoch": 1.0755629054827631, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.524131774902344, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8771184682846069, "num_tokens": 322471525.0, "step": 8455 }, { "epoch": 1.0756901157613534, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.894710540771484, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8607407808303833, "num_tokens": 322516594.0, "step": 8456 }, { "epoch": 1.075817326039944, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.86545944213867, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8657262325286865, "num_tokens": 322557837.0, "step": 8457 }, { "epoch": 1.0759445363185345, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.309593200683594, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8528231978416443, "num_tokens": 322594165.0, "step": 8458 }, { "epoch": 1.076071746597125, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 36.075103759765625, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8416979908943176, "num_tokens": 322637083.0, "step": 8459 }, { "epoch": 1.0761989568757155, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.80072021484375, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8568763732910156, "num_tokens": 322673216.0, "step": 8460 }, { "epoch": 1.076326167154306, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 36.008670806884766, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8610068559646606, "num_tokens": 322709150.0, "step": 8461 }, { "epoch": 1.0764533774328966, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.749732971191406, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8630233407020569, "num_tokens": 322747893.0, "step": 8462 }, { "epoch": 1.0765805877114871, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.859375, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8407658338546753, "num_tokens": 322781667.0, "step": 8463 }, { "epoch": 1.0767077979900777, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.692176818847656, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8567297458648682, "num_tokens": 322818008.0, "step": 8464 }, { "epoch": 1.0768350082686682, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.382423400878906, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8382400274276733, "num_tokens": 322856528.0, "step": 8465 }, { "epoch": 1.0769622185472587, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.85492706298828, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8601652383804321, "num_tokens": 322888798.0, "step": 8466 }, { "epoch": 1.077089428825849, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.62297439575195, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8588462471961975, "num_tokens": 322923301.0, "step": 8467 }, { "epoch": 1.0772166391044395, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.59436798095703, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8706892132759094, "num_tokens": 322964253.0, "step": 8468 }, { "epoch": 1.07734384938303, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.92365646362305, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8570164442062378, "num_tokens": 323006926.0, "step": 8469 }, { "epoch": 1.0774710596616206, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.54017639160156, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8707398772239685, "num_tokens": 323047541.0, "step": 8470 }, { "epoch": 1.0775982699402111, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 36.165916442871094, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8499401807785034, "num_tokens": 323084272.0, "step": 8471 }, { "epoch": 1.0777254802188017, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.39454650878906, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8687615394592285, "num_tokens": 323116698.0, "step": 8472 }, { "epoch": 1.0778526904973922, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.83875274658203, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8658996820449829, "num_tokens": 323158836.0, "step": 8473 }, { "epoch": 1.0779799007759827, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.917266845703125, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8539004921913147, "num_tokens": 323193777.0, "step": 8474 }, { "epoch": 1.0781071110545732, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.32027053833008, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8584135174751282, "num_tokens": 323232546.0, "step": 8475 }, { "epoch": 1.0782343213331638, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.958370208740234, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8581537008285522, "num_tokens": 323269403.0, "step": 8476 }, { "epoch": 1.0783615316117543, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.537498474121094, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8709684610366821, "num_tokens": 323305982.0, "step": 8477 }, { "epoch": 1.0784887418903448, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.960567474365234, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8507083058357239, "num_tokens": 323341032.0, "step": 8478 }, { "epoch": 1.0786159521689354, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.73334884643555, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8521958589553833, "num_tokens": 323381647.0, "step": 8479 }, { "epoch": 1.0787431624475257, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.6992301940918, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8619362115859985, "num_tokens": 323417185.0, "step": 8480 }, { "epoch": 1.0788703727261162, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.78323745727539, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8595261573791504, "num_tokens": 323454624.0, "step": 8481 }, { "epoch": 1.0789975830047067, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.51171875, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8590894937515259, "num_tokens": 323492862.0, "step": 8482 }, { "epoch": 1.0791247932832972, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.805030822753906, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.854804515838623, "num_tokens": 323536012.0, "step": 8483 }, { "epoch": 1.0792520035618878, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.65940475463867, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8625019788742065, "num_tokens": 323573495.0, "step": 8484 }, { "epoch": 1.0793792138404783, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.7670783996582, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8678584694862366, "num_tokens": 323610900.0, "step": 8485 }, { "epoch": 1.0795064241190688, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.65544891357422, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8572934865951538, "num_tokens": 323650983.0, "step": 8486 }, { "epoch": 1.0796336343976594, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 36.169742584228516, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8499332070350647, "num_tokens": 323690155.0, "step": 8487 }, { "epoch": 1.0797608446762499, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.701393127441406, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8721415996551514, "num_tokens": 323732212.0, "step": 8488 }, { "epoch": 1.0798880549548404, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 36.159305572509766, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.868190586566925, "num_tokens": 323773400.0, "step": 8489 }, { "epoch": 1.080015265233431, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.51052474975586, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8652026057243347, "num_tokens": 323812753.0, "step": 8490 }, { "epoch": 1.0801424755120215, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.93909454345703, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8608582019805908, "num_tokens": 323855835.0, "step": 8491 }, { "epoch": 1.0802696857906118, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 36.168800354003906, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8722860813140869, "num_tokens": 323896065.0, "step": 8492 }, { "epoch": 1.0803968960692023, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.76737976074219, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8619990944862366, "num_tokens": 323934351.0, "step": 8493 }, { "epoch": 1.0805241063477928, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.86992645263672, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8474854230880737, "num_tokens": 323968878.0, "step": 8494 }, { "epoch": 1.0806513166263834, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.93368911743164, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.857122540473938, "num_tokens": 324004838.0, "step": 8495 }, { "epoch": 1.080778526904974, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.67931365966797, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8737887740135193, "num_tokens": 324040851.0, "step": 8496 }, { "epoch": 1.0809057371835644, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 36.240753173828125, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8468278646469116, "num_tokens": 324084675.0, "step": 8497 }, { "epoch": 1.081032947462155, "ewc_loss": 0.10791015625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.916854858398438e-05, "grad_norm": 35.72856903076172, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8635456562042236, "num_tokens": 324122136.0, "step": 8498 }, { "epoch": 1.0811601577407455, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.70573043823242, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8571361303329468, "num_tokens": 324153142.0, "step": 8499 }, { "epoch": 1.081287368019336, "ewc_loss": 0.1083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 8.96453857421875e-05, "grad_norm": 35.539615631103516, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8394489884376526, "num_tokens": 324187244.0, "step": 8500 }, { "epoch": 1.0814145782979265, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.81801223754883, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8607041835784912, "num_tokens": 324225644.0, "step": 8501 }, { "epoch": 1.081541788576517, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.48095703125, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8664595484733582, "num_tokens": 324270394.0, "step": 8502 }, { "epoch": 1.0816689988551076, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.71083068847656, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8697803616523743, "num_tokens": 324311244.0, "step": 8503 }, { "epoch": 1.0817962091336981, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.975624084472656, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8408638834953308, "num_tokens": 324350821.0, "step": 8504 }, { "epoch": 1.0819234194122884, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.36409378051758, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8579280376434326, "num_tokens": 324388255.0, "step": 8505 }, { "epoch": 1.082050629690879, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.035709381103516, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.834004282951355, "num_tokens": 324430068.0, "step": 8506 }, { "epoch": 1.0821778399694695, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.4827880859375, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8643590211868286, "num_tokens": 324465341.0, "step": 8507 }, { "epoch": 1.08230505024806, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.9378662109375, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8665076494216919, "num_tokens": 324502199.0, "step": 8508 }, { "epoch": 1.0824322605266505, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.541316986083984, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8599909543991089, "num_tokens": 324535898.0, "step": 8509 }, { "epoch": 1.082559470805241, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.697025299072266, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8486899137496948, "num_tokens": 324574404.0, "step": 8510 }, { "epoch": 1.0826866810838316, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.453617095947266, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8572441339492798, "num_tokens": 324611743.0, "step": 8511 }, { "epoch": 1.0828138913624221, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.94648361206055, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8485288619995117, "num_tokens": 324653126.0, "step": 8512 }, { "epoch": 1.0829411016410126, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.647037506103516, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8645555377006531, "num_tokens": 324692957.0, "step": 8513 }, { "epoch": 1.0830683119196032, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.05085372924805, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8537055850028992, "num_tokens": 324737214.0, "step": 8514 }, { "epoch": 1.0831955221981937, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.636653900146484, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8706082105636597, "num_tokens": 324773098.0, "step": 8515 }, { "epoch": 1.083322732476784, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.12019348144531, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8464496731758118, "num_tokens": 324815846.0, "step": 8516 }, { "epoch": 1.0834499427553745, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.57290267944336, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8541523218154907, "num_tokens": 324855912.0, "step": 8517 }, { "epoch": 1.083577153033965, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.871463775634766, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8513987064361572, "num_tokens": 324893090.0, "step": 8518 }, { "epoch": 1.0837043633125556, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.766212463378906, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8597410917282104, "num_tokens": 324929779.0, "step": 8519 }, { "epoch": 1.0838315735911461, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.04943084716797, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8642331957817078, "num_tokens": 324961349.0, "step": 8520 }, { "epoch": 1.0839587838697367, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.836158752441406, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.856991171836853, "num_tokens": 324998688.0, "step": 8521 }, { "epoch": 1.0840859941483272, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.923133850097656, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8501506447792053, "num_tokens": 325040601.0, "step": 8522 }, { "epoch": 1.0842132044269177, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.874305725097656, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8626755475997925, "num_tokens": 325081729.0, "step": 8523 }, { "epoch": 1.0843404147055082, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.82665252685547, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8482638597488403, "num_tokens": 325118886.0, "step": 8524 }, { "epoch": 1.0844676249840988, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.63918685913086, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.876160740852356, "num_tokens": 325156417.0, "step": 8525 }, { "epoch": 1.0845948352626893, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.798580169677734, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8701666593551636, "num_tokens": 325200902.0, "step": 8526 }, { "epoch": 1.0847220455412798, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.875213623046875, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.860326886177063, "num_tokens": 325233362.0, "step": 8527 }, { "epoch": 1.0848492558198704, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.719444274902344, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8587079048156738, "num_tokens": 325274646.0, "step": 8528 }, { "epoch": 1.0849764660984607, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.06649398803711, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.87485671043396, "num_tokens": 325304735.0, "step": 8529 }, { "epoch": 1.0851036763770512, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.45488739013672, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.861830472946167, "num_tokens": 325339833.0, "step": 8530 }, { "epoch": 1.0852308866556417, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.21421813964844, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8467738032341003, "num_tokens": 325373838.0, "step": 8531 }, { "epoch": 1.0853580969342322, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.5357780456543, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8800780177116394, "num_tokens": 325412913.0, "step": 8532 }, { "epoch": 1.0854853072128228, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.14670944213867, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8639727830886841, "num_tokens": 325451830.0, "step": 8533 }, { "epoch": 1.0856125174914133, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.82338333129883, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8420397639274597, "num_tokens": 325487162.0, "step": 8534 }, { "epoch": 1.0857397277700038, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.606956481933594, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8686374425888062, "num_tokens": 325525906.0, "step": 8535 }, { "epoch": 1.0858669380485944, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.8731803894043, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8721427917480469, "num_tokens": 325559925.0, "step": 8536 }, { "epoch": 1.0859941483271849, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.91789245605469, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8575138449668884, "num_tokens": 325599980.0, "step": 8537 }, { "epoch": 1.0861213586057754, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.976722717285156, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8410180807113647, "num_tokens": 325631745.0, "step": 8538 }, { "epoch": 1.086248568884366, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.20116424560547, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8556732535362244, "num_tokens": 325673583.0, "step": 8539 }, { "epoch": 1.0863757791629565, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.97456359863281, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8502871990203857, "num_tokens": 325712995.0, "step": 8540 }, { "epoch": 1.0865029894415468, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 36.079463958740234, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8649337291717529, "num_tokens": 325748259.0, "step": 8541 }, { "epoch": 1.0866301997201373, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 36.134368896484375, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8640013337135315, "num_tokens": 325787676.0, "step": 8542 }, { "epoch": 1.0867574099987278, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.91587829589844, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8339512348175049, "num_tokens": 325827819.0, "step": 8543 }, { "epoch": 1.0868846202773184, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 36.08223342895508, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8675389289855957, "num_tokens": 325866328.0, "step": 8544 }, { "epoch": 1.0870118305559089, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.91067886352539, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8534218072891235, "num_tokens": 325906752.0, "step": 8545 }, { "epoch": 1.0871390408344994, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.90501022338867, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8539741039276123, "num_tokens": 325943765.0, "step": 8546 }, { "epoch": 1.08726625111309, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.81666946411133, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8642600774765015, "num_tokens": 325981471.0, "step": 8547 }, { "epoch": 1.0873934613916805, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.55338668823242, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.864949643611908, "num_tokens": 326017253.0, "step": 8548 }, { "epoch": 1.087520671670271, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.14841079711914, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8586913347244263, "num_tokens": 326053484.0, "step": 8549 }, { "epoch": 1.0876478819488615, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.28264236450195, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8648025989532471, "num_tokens": 326084779.0, "step": 8550 }, { "epoch": 1.087775092227452, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.83031463623047, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.855983316898346, "num_tokens": 326120795.0, "step": 8551 }, { "epoch": 1.0879023025060426, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.51593017578125, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8570784330368042, "num_tokens": 326155991.0, "step": 8552 }, { "epoch": 1.0880295127846331, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.642967224121094, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8564706444740295, "num_tokens": 326195738.0, "step": 8553 }, { "epoch": 1.0881567230632234, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.667476654052734, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8575288653373718, "num_tokens": 326237773.0, "step": 8554 }, { "epoch": 1.088283933341814, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.62031173706055, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8717947006225586, "num_tokens": 326278589.0, "step": 8555 }, { "epoch": 1.0884111436204045, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.21702194213867, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8508764505386353, "num_tokens": 326316862.0, "step": 8556 }, { "epoch": 1.088538353898995, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.04618453979492, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8562286496162415, "num_tokens": 326353892.0, "step": 8557 }, { "epoch": 1.0886655641775855, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.30128860473633, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8630812764167786, "num_tokens": 326389478.0, "step": 8558 }, { "epoch": 1.088792774456176, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.73468780517578, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8523094058036804, "num_tokens": 326428072.0, "step": 8559 }, { "epoch": 1.0889199847347666, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.68451690673828, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8604621291160583, "num_tokens": 326466374.0, "step": 8560 }, { "epoch": 1.0890471950133571, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.87096405029297, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8546164035797119, "num_tokens": 326501081.0, "step": 8561 }, { "epoch": 1.0891744052919476, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.751529693603516, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8672390580177307, "num_tokens": 326532539.0, "step": 8562 }, { "epoch": 1.0893016155705382, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.4728889465332, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8649306297302246, "num_tokens": 326569481.0, "step": 8563 }, { "epoch": 1.0894288258491287, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.006988525390625, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8565531373023987, "num_tokens": 326609746.0, "step": 8564 }, { "epoch": 1.089556036127719, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.79718017578125, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8478597402572632, "num_tokens": 326652379.0, "step": 8565 }, { "epoch": 1.0896832464063095, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.639129638671875, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8546793460845947, "num_tokens": 326687617.0, "step": 8566 }, { "epoch": 1.0898104566849, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.880104064941406, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8493560552597046, "num_tokens": 326724155.0, "step": 8567 }, { "epoch": 1.0899376669634906, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.68868637084961, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8572195768356323, "num_tokens": 326768234.0, "step": 8568 }, { "epoch": 1.0900648772420811, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.04352951049805, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8649981617927551, "num_tokens": 326810423.0, "step": 8569 }, { "epoch": 1.0901920875206716, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.820594787597656, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8563177585601807, "num_tokens": 326851834.0, "step": 8570 }, { "epoch": 1.0903192977992622, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.9385871887207, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8448655605316162, "num_tokens": 326889334.0, "step": 8571 }, { "epoch": 1.0904465080778527, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.3997688293457, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8459048271179199, "num_tokens": 326931884.0, "step": 8572 }, { "epoch": 1.0905737183564432, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.1000862121582, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8677902817726135, "num_tokens": 326969043.0, "step": 8573 }, { "epoch": 1.0907009286350338, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.8133659362793, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8602895140647888, "num_tokens": 326998369.0, "step": 8574 }, { "epoch": 1.0908281389136243, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.80881881713867, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8583498001098633, "num_tokens": 327036909.0, "step": 8575 }, { "epoch": 1.0909553491922148, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.776859283447266, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8511528968811035, "num_tokens": 327076053.0, "step": 8576 }, { "epoch": 1.0910825594708053, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.46851348876953, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8658122420310974, "num_tokens": 327112285.0, "step": 8577 }, { "epoch": 1.0912097697493957, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.24991226196289, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8640137910842896, "num_tokens": 327151591.0, "step": 8578 }, { "epoch": 1.0913369800279862, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.2513427734375, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.842483401298523, "num_tokens": 327193600.0, "step": 8579 }, { "epoch": 1.0914641903065767, "ewc_loss": 0.10888671875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.012222290039062e-05, "grad_norm": 35.45103073120117, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8478658199310303, "num_tokens": 327230249.0, "step": 8580 }, { "epoch": 1.0915914005851672, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.12460708618164, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8728883862495422, "num_tokens": 327270043.0, "step": 8581 }, { "epoch": 1.0917186108637578, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.39937210083008, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.86021888256073, "num_tokens": 327309396.0, "step": 8582 }, { "epoch": 1.0918458211423483, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.512325286865234, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8664827942848206, "num_tokens": 327349497.0, "step": 8583 }, { "epoch": 1.0919730314209388, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.43475341796875, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8505263924598694, "num_tokens": 327385648.0, "step": 8584 }, { "epoch": 1.0921002416995294, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.48853302001953, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8609045147895813, "num_tokens": 327426379.0, "step": 8585 }, { "epoch": 1.0922274519781199, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 35.439876556396484, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8634665012359619, "num_tokens": 327467555.0, "step": 8586 }, { "epoch": 1.0923546622567104, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.3135871887207, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8543597459793091, "num_tokens": 327505303.0, "step": 8587 }, { "epoch": 1.092481872535301, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.703765869140625, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8641729950904846, "num_tokens": 327547854.0, "step": 8588 }, { "epoch": 1.0926090828138915, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.97452926635742, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8595340251922607, "num_tokens": 327584132.0, "step": 8589 }, { "epoch": 1.0927362930924818, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.80471420288086, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8469584584236145, "num_tokens": 327629250.0, "step": 8590 }, { "epoch": 1.0928635033710723, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.01858139038086, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8509997725486755, "num_tokens": 327666988.0, "step": 8591 }, { "epoch": 1.0929907136496628, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.59910583496094, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8602738380432129, "num_tokens": 327705955.0, "step": 8592 }, { "epoch": 1.0931179239282534, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.547325134277344, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8499664664268494, "num_tokens": 327746362.0, "step": 8593 }, { "epoch": 1.0932451342068439, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.6690788269043, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8589057922363281, "num_tokens": 327789034.0, "step": 8594 }, { "epoch": 1.0933723444854344, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.163108825683594, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8469352722167969, "num_tokens": 327824203.0, "step": 8595 }, { "epoch": 1.093499554764025, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.70283508300781, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8588019609451294, "num_tokens": 327860340.0, "step": 8596 }, { "epoch": 1.0936267650426155, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.164608001708984, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8507036566734314, "num_tokens": 327901709.0, "step": 8597 }, { "epoch": 1.093753975321206, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.77987289428711, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.858799397945404, "num_tokens": 327940327.0, "step": 8598 }, { "epoch": 1.0938811855997965, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.24519348144531, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8625613451004028, "num_tokens": 327977966.0, "step": 8599 }, { "epoch": 1.094008395878387, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.54530715942383, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8534095287322998, "num_tokens": 328021143.0, "step": 8600 }, { "epoch": 1.0941356061569776, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.092002868652344, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8637710213661194, "num_tokens": 328060792.0, "step": 8601 }, { "epoch": 1.094262816435568, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.136112213134766, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8587325215339661, "num_tokens": 328099698.0, "step": 8602 }, { "epoch": 1.0943900267141584, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.67789840698242, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8613733649253845, "num_tokens": 328141075.0, "step": 8603 }, { "epoch": 1.094517236992749, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.438568115234375, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8501394987106323, "num_tokens": 328178945.0, "step": 8604 }, { "epoch": 1.0946444472713395, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.665802001953125, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8322080373764038, "num_tokens": 328214423.0, "step": 8605 }, { "epoch": 1.09477165754993, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.288230895996094, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8585271835327148, "num_tokens": 328248640.0, "step": 8606 }, { "epoch": 1.0948988678285205, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.53184127807617, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8365285992622375, "num_tokens": 328283016.0, "step": 8607 }, { "epoch": 1.095026078107111, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.17626190185547, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8629924654960632, "num_tokens": 328322377.0, "step": 8608 }, { "epoch": 1.0951532883857016, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.81851577758789, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8599966168403625, "num_tokens": 328359409.0, "step": 8609 }, { "epoch": 1.0952804986642921, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.083194732666016, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.862544059753418, "num_tokens": 328401257.0, "step": 8610 }, { "epoch": 1.0954077089428826, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.81219482421875, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8604640960693359, "num_tokens": 328435886.0, "step": 8611 }, { "epoch": 1.0955349192214732, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.821197509765625, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8474962711334229, "num_tokens": 328472091.0, "step": 8612 }, { "epoch": 1.0956621295000637, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.9854621887207, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8504768013954163, "num_tokens": 328509453.0, "step": 8613 }, { "epoch": 1.095789339778654, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.76335906982422, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8575978875160217, "num_tokens": 328545590.0, "step": 8614 }, { "epoch": 1.0959165500572445, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.33049392700195, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8467636704444885, "num_tokens": 328583093.0, "step": 8615 }, { "epoch": 1.096043760335835, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.624656677246094, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8408106565475464, "num_tokens": 328625312.0, "step": 8616 }, { "epoch": 1.0961709706144256, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.954620361328125, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8523544073104858, "num_tokens": 328655392.0, "step": 8617 }, { "epoch": 1.0962981808930161, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.84428787231445, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8415476679801941, "num_tokens": 328693686.0, "step": 8618 }, { "epoch": 1.0964253911716066, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.96528625488281, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8679683208465576, "num_tokens": 328728051.0, "step": 8619 }, { "epoch": 1.0965526014501972, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.785343170166016, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8460944294929504, "num_tokens": 328768091.0, "step": 8620 }, { "epoch": 1.0966798117287877, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.65502166748047, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8525398969650269, "num_tokens": 328803851.0, "step": 8621 }, { "epoch": 1.0968070220073782, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.11989974975586, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8681665062904358, "num_tokens": 328837429.0, "step": 8622 }, { "epoch": 1.0969342322859688, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.95276641845703, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8581358790397644, "num_tokens": 328874264.0, "step": 8623 }, { "epoch": 1.0970614425645593, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.957393646240234, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8593652248382568, "num_tokens": 328911627.0, "step": 8624 }, { "epoch": 1.0971886528431498, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 36.078067779541016, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8502068519592285, "num_tokens": 328944471.0, "step": 8625 }, { "epoch": 1.0973158631217403, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.94738006591797, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8652722239494324, "num_tokens": 328983826.0, "step": 8626 }, { "epoch": 1.0974430734003306, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.13991928100586, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8587229251861572, "num_tokens": 329019385.0, "step": 8627 }, { "epoch": 1.0975702836789212, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.9942741394043, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.845743715763092, "num_tokens": 329065179.0, "step": 8628 }, { "epoch": 1.0976974939575117, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.11353302001953, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.865149736404419, "num_tokens": 329101077.0, "step": 8629 }, { "epoch": 1.0978247042361022, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.711570739746094, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.861308217048645, "num_tokens": 329137538.0, "step": 8630 }, { "epoch": 1.0979519145146928, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.94758224487305, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8704855442047119, "num_tokens": 329171952.0, "step": 8631 }, { "epoch": 1.0980791247932833, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.81571960449219, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8659907579421997, "num_tokens": 329214956.0, "step": 8632 }, { "epoch": 1.0982063350718738, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.17940139770508, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8460414409637451, "num_tokens": 329254228.0, "step": 8633 }, { "epoch": 1.0983335453504643, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.29414749145508, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.859511137008667, "num_tokens": 329288337.0, "step": 8634 }, { "epoch": 1.0984607556290549, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.06892013549805, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8517758846282959, "num_tokens": 329329118.0, "step": 8635 }, { "epoch": 1.0985879659076454, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.956363677978516, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8436568975448608, "num_tokens": 329366301.0, "step": 8636 }, { "epoch": 1.098715176186236, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.80924606323242, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8455737829208374, "num_tokens": 329401612.0, "step": 8637 }, { "epoch": 1.0988423864648265, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.05226135253906, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8601011037826538, "num_tokens": 329440403.0, "step": 8638 }, { "epoch": 1.0989695967434168, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.07380676269531, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8683481812477112, "num_tokens": 329478699.0, "step": 8639 }, { "epoch": 1.0990968070220073, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.23430252075195, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.855850100517273, "num_tokens": 329517860.0, "step": 8640 }, { "epoch": 1.0992240173005978, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.960811614990234, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8629484176635742, "num_tokens": 329555385.0, "step": 8641 }, { "epoch": 1.0993512275791884, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.99158477783203, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8529638051986694, "num_tokens": 329594775.0, "step": 8642 }, { "epoch": 1.0994784378577789, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.01837921142578, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8617873191833496, "num_tokens": 329629872.0, "step": 8643 }, { "epoch": 1.0996056481363694, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.007232666015625, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8525593876838684, "num_tokens": 329665263.0, "step": 8644 }, { "epoch": 1.09973285841496, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.95488357543945, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.84733647108078, "num_tokens": 329705853.0, "step": 8645 }, { "epoch": 1.0998600686935505, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.666351318359375, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.864454448223114, "num_tokens": 329741690.0, "step": 8646 }, { "epoch": 1.099987278972141, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.301048278808594, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8508745431900024, "num_tokens": 329781548.0, "step": 8647 }, { "epoch": 1.1001144892507315, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.850196838378906, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8653760552406311, "num_tokens": 329822304.0, "step": 8648 }, { "epoch": 1.100241699529322, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.56748962402344, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8591859340667725, "num_tokens": 329856065.0, "step": 8649 }, { "epoch": 1.1003689098079126, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.66142272949219, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8420776128768921, "num_tokens": 329895002.0, "step": 8650 }, { "epoch": 1.100496120086503, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.56264877319336, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8523443341255188, "num_tokens": 329938810.0, "step": 8651 }, { "epoch": 1.1006233303650934, "ewc_loss": 0.109375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.059906005859375e-05, "grad_norm": 35.9773063659668, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8701439499855042, "num_tokens": 329976513.0, "step": 8652 }, { "epoch": 1.100750540643684, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.24580383300781, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8651883602142334, "num_tokens": 330014013.0, "step": 8653 }, { "epoch": 1.1008777509222745, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.1552734375e-05, "grad_norm": 36.06629943847656, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8543643951416016, "num_tokens": 330052577.0, "step": 8654 }, { "epoch": 1.101004961200865, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.950992584228516, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8560330271720886, "num_tokens": 330091385.0, "step": 8655 }, { "epoch": 1.1011321714794555, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.09981918334961, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8658753633499146, "num_tokens": 330130285.0, "step": 8656 }, { "epoch": 1.101259381758046, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.90438461303711, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8425955772399902, "num_tokens": 330166823.0, "step": 8657 }, { "epoch": 1.1013865920366366, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.07230758666992, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8592012524604797, "num_tokens": 330200655.0, "step": 8658 }, { "epoch": 1.101513802315227, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.576969146728516, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8559530377388, "num_tokens": 330243656.0, "step": 8659 }, { "epoch": 1.1016410125938176, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.50422286987305, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8495438694953918, "num_tokens": 330287200.0, "step": 8660 }, { "epoch": 1.1017682228724082, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.63283920288086, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8605172634124756, "num_tokens": 330324794.0, "step": 8661 }, { "epoch": 1.1018954331509987, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.491153717041016, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8589182496070862, "num_tokens": 330365297.0, "step": 8662 }, { "epoch": 1.102022643429589, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 35.403709411621094, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8760977387428284, "num_tokens": 330408046.0, "step": 8663 }, { "epoch": 1.1021498537081795, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.74842071533203, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8549643158912659, "num_tokens": 330447815.0, "step": 8664 }, { "epoch": 1.10227706398677, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.31031799316406, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8455737829208374, "num_tokens": 330487796.0, "step": 8665 }, { "epoch": 1.1024042742653606, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.654991149902344, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.868470311164856, "num_tokens": 330531155.0, "step": 8666 }, { "epoch": 1.1025314845439511, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.779781341552734, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8663509488105774, "num_tokens": 330568611.0, "step": 8667 }, { "epoch": 1.1026586948225416, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.16275405883789, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8529731035232544, "num_tokens": 330608085.0, "step": 8668 }, { "epoch": 1.1027859051011322, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.70648956298828, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8396795988082886, "num_tokens": 330644776.0, "step": 8669 }, { "epoch": 1.1029131153797227, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.04490280151367, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8458185791969299, "num_tokens": 330686401.0, "step": 8670 }, { "epoch": 1.1030403256583132, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.244895935058594, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8612972497940063, "num_tokens": 330723115.0, "step": 8671 }, { "epoch": 1.1031675359369038, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.6038818359375, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8721165060997009, "num_tokens": 330761170.0, "step": 8672 }, { "epoch": 1.1032947462154943, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.32393264770508, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8572000861167908, "num_tokens": 330790230.0, "step": 8673 }, { "epoch": 1.1034219564940848, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.54621887207031, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8692022562026978, "num_tokens": 330826178.0, "step": 8674 }, { "epoch": 1.1035491667726753, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.53718566894531, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8636540174484253, "num_tokens": 330866341.0, "step": 8675 }, { "epoch": 1.1036763770512656, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.66405487060547, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8427577018737793, "num_tokens": 330903955.0, "step": 8676 }, { "epoch": 1.1038035873298562, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.24820327758789, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8549514412879944, "num_tokens": 330943024.0, "step": 8677 }, { "epoch": 1.1039307976084467, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.89950942993164, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8523675203323364, "num_tokens": 330981004.0, "step": 8678 }, { "epoch": 1.1040580078870372, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.19809341430664, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8563687801361084, "num_tokens": 331021532.0, "step": 8679 }, { "epoch": 1.1041852181656278, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 35.829010009765625, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8643147945404053, "num_tokens": 331058010.0, "step": 8680 }, { "epoch": 1.1043124284442183, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.144439697265625, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8663780093193054, "num_tokens": 331090713.0, "step": 8681 }, { "epoch": 1.1044396387228088, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.82160568237305, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8635154962539673, "num_tokens": 331127532.0, "step": 8682 }, { "epoch": 1.1045668490013993, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.07670593261719, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8360462188720703, "num_tokens": 331172088.0, "step": 8683 }, { "epoch": 1.1046940592799899, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.209442138671875, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.852263331413269, "num_tokens": 331213419.0, "step": 8684 }, { "epoch": 1.1048212695585804, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.400611877441406, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8564966320991516, "num_tokens": 331250102.0, "step": 8685 }, { "epoch": 1.104948479837171, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.066165924072266, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8592233657836914, "num_tokens": 331284597.0, "step": 8686 }, { "epoch": 1.1050756901157615, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.217533111572266, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8500102758407593, "num_tokens": 331330688.0, "step": 8687 }, { "epoch": 1.1052029003943518, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.04233932495117, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8620569705963135, "num_tokens": 331368777.0, "step": 8688 }, { "epoch": 1.1053301106729423, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.0108757019043, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8628857731819153, "num_tokens": 331414369.0, "step": 8689 }, { "epoch": 1.1054573209515328, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.95500946044922, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8639082908630371, "num_tokens": 331452788.0, "step": 8690 }, { "epoch": 1.1055845312301233, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.32917022705078, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8461789488792419, "num_tokens": 331489575.0, "step": 8691 }, { "epoch": 1.1057117415087139, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.84119415283203, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8747268915176392, "num_tokens": 331529129.0, "step": 8692 }, { "epoch": 1.1058389517873044, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.316532135009766, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8545504808425903, "num_tokens": 331564989.0, "step": 8693 }, { "epoch": 1.105966162065895, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.04448318481445, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8601603507995605, "num_tokens": 331602561.0, "step": 8694 }, { "epoch": 1.1060933723444855, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.297828674316406, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8594443202018738, "num_tokens": 331646902.0, "step": 8695 }, { "epoch": 1.106220582623076, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.9704475402832, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8594415783882141, "num_tokens": 331682347.0, "step": 8696 }, { "epoch": 1.1063477929016665, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.50514602661133, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8507744073867798, "num_tokens": 331725813.0, "step": 8697 }, { "epoch": 1.106475003180257, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.050750732421875, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.836865246295929, "num_tokens": 331767856.0, "step": 8698 }, { "epoch": 1.1066022134588476, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.692264556884766, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8668619394302368, "num_tokens": 331804238.0, "step": 8699 }, { "epoch": 1.106729423737438, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.92221450805664, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8575531244277954, "num_tokens": 331839963.0, "step": 8700 }, { "epoch": 1.1068566340160284, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.570518493652344, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8616276979446411, "num_tokens": 331881563.0, "step": 8701 }, { "epoch": 1.106983844294619, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 35.826690673828125, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.853278398513794, "num_tokens": 331925540.0, "step": 8702 }, { "epoch": 1.1071110545732095, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.774166107177734, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8663467764854431, "num_tokens": 331963171.0, "step": 8703 }, { "epoch": 1.1072382648518, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.922874450683594, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8770861625671387, "num_tokens": 331998310.0, "step": 8704 }, { "epoch": 1.1073654751303905, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.19647979736328, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8636044859886169, "num_tokens": 332034181.0, "step": 8705 }, { "epoch": 1.107492685408981, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.09917449951172, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8683592677116394, "num_tokens": 332075368.0, "step": 8706 }, { "epoch": 1.1076198956875716, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.81278991699219, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.868520975112915, "num_tokens": 332113281.0, "step": 8707 }, { "epoch": 1.107747105966162, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.653533935546875, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8498374223709106, "num_tokens": 332154206.0, "step": 8708 }, { "epoch": 1.1078743162447526, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.83089065551758, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8433412313461304, "num_tokens": 332195409.0, "step": 8709 }, { "epoch": 1.1080015265233432, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 35.65233612060547, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8670957088470459, "num_tokens": 332233836.0, "step": 8710 }, { "epoch": 1.1081287368019337, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.68824768066406, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8629502654075623, "num_tokens": 332274167.0, "step": 8711 }, { "epoch": 1.108255947080524, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.063785552978516, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8542540073394775, "num_tokens": 332315856.0, "step": 8712 }, { "epoch": 1.1083831573591145, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.83418273925781, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8579356670379639, "num_tokens": 332366365.0, "step": 8713 }, { "epoch": 1.108510367637705, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.20297622680664, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8554815053939819, "num_tokens": 332400411.0, "step": 8714 }, { "epoch": 1.1086375779162956, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.12221145629883, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8611806631088257, "num_tokens": 332441959.0, "step": 8715 }, { "epoch": 1.108764788194886, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.663204193115234, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.845594048500061, "num_tokens": 332479545.0, "step": 8716 }, { "epoch": 1.1088919984734766, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.066497802734375, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.868758499622345, "num_tokens": 332516240.0, "step": 8717 }, { "epoch": 1.1090192087520672, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.17534637451172, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.863480269908905, "num_tokens": 332553488.0, "step": 8718 }, { "epoch": 1.1091464190306577, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.29346466064453, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8490428924560547, "num_tokens": 332588646.0, "step": 8719 }, { "epoch": 1.1092736293092482, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.185523986816406, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8477222919464111, "num_tokens": 332629103.0, "step": 8720 }, { "epoch": 1.1094008395878387, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.588809967041016, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8642696142196655, "num_tokens": 332664820.0, "step": 8721 }, { "epoch": 1.1095280498664293, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.3851318359375, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8593785762786865, "num_tokens": 332699772.0, "step": 8722 }, { "epoch": 1.1096552601450198, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.01438522338867, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8666179776191711, "num_tokens": 332733478.0, "step": 8723 }, { "epoch": 1.1097824704236103, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.0898551940918, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8528845310211182, "num_tokens": 332774733.0, "step": 8724 }, { "epoch": 1.1099096807022006, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.30812072753906, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8345786333084106, "num_tokens": 332812970.0, "step": 8725 }, { "epoch": 1.1100368909807912, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.23958206176758, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8547343611717224, "num_tokens": 332854974.0, "step": 8726 }, { "epoch": 1.1101641012593817, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.512638092041016, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8444553017616272, "num_tokens": 332886462.0, "step": 8727 }, { "epoch": 1.1102913115379722, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.13763427734375, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8541948795318604, "num_tokens": 332928614.0, "step": 8728 }, { "epoch": 1.1104185218165628, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.5103645324707, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8582027554512024, "num_tokens": 332962447.0, "step": 8729 }, { "epoch": 1.1105457320951533, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.8826789855957, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.847926139831543, "num_tokens": 332991395.0, "step": 8730 }, { "epoch": 1.1106729423737438, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.5347900390625, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8502632975578308, "num_tokens": 333027852.0, "step": 8731 }, { "epoch": 1.1108001526523343, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.21572494506836, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8602745532989502, "num_tokens": 333063337.0, "step": 8732 }, { "epoch": 1.1109273629309249, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.68439865112305, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8476667404174805, "num_tokens": 333094632.0, "step": 8733 }, { "epoch": 1.1110545732095154, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.814517974853516, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8724662065505981, "num_tokens": 333134071.0, "step": 8734 }, { "epoch": 1.111181783488106, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.82560729980469, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8622589111328125, "num_tokens": 333171288.0, "step": 8735 }, { "epoch": 1.1113089937666965, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 35.917049407958984, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8650035858154297, "num_tokens": 333212111.0, "step": 8736 }, { "epoch": 1.1114362040452868, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.87017822265625, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8630819320678711, "num_tokens": 333247557.0, "step": 8737 }, { "epoch": 1.1115634143238773, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.20310974121094, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8590335845947266, "num_tokens": 333278705.0, "step": 8738 }, { "epoch": 1.1116906246024678, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.85521697998047, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8607594966888428, "num_tokens": 333316493.0, "step": 8739 }, { "epoch": 1.1118178348810583, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 36.23695755004883, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8610554933547974, "num_tokens": 333360384.0, "step": 8740 }, { "epoch": 1.1119450451596489, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.306156158447266, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8578217029571533, "num_tokens": 333395568.0, "step": 8741 }, { "epoch": 1.1120722554382394, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.942893981933594, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8620424866676331, "num_tokens": 333429267.0, "step": 8742 }, { "epoch": 1.11219946571683, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.070308685302734, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8654948472976685, "num_tokens": 333469908.0, "step": 8743 }, { "epoch": 1.1123266759954205, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.706058502197266, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8611377477645874, "num_tokens": 333507865.0, "step": 8744 }, { "epoch": 1.112453886274011, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.284915924072266, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8695958256721497, "num_tokens": 333544011.0, "step": 8745 }, { "epoch": 1.1125810965526015, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.21349334716797, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8600110411643982, "num_tokens": 333581949.0, "step": 8746 }, { "epoch": 1.112708306831192, "ewc_loss": 0.111328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.298324584960938e-05, "grad_norm": 36.46549987792969, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8662241697311401, "num_tokens": 333615994.0, "step": 8747 }, { "epoch": 1.1128355171097826, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.11531066894531, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8746632933616638, "num_tokens": 333654454.0, "step": 8748 }, { "epoch": 1.112962727388373, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.530555725097656, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8506579995155334, "num_tokens": 333700206.0, "step": 8749 }, { "epoch": 1.1130899376669634, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.051597595214844, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8613816499710083, "num_tokens": 333737538.0, "step": 8750 }, { "epoch": 1.113217147945554, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.599609375, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8594405651092529, "num_tokens": 333771237.0, "step": 8751 }, { "epoch": 1.1133443582241445, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.30255126953125, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8561859130859375, "num_tokens": 333806522.0, "step": 8752 }, { "epoch": 1.113471568502735, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.31541061401367, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8566722869873047, "num_tokens": 333846656.0, "step": 8753 }, { "epoch": 1.1135987787813255, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.12949752807617, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8502647876739502, "num_tokens": 333886294.0, "step": 8754 }, { "epoch": 1.113725989059916, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.55658721923828, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8559364080429077, "num_tokens": 333923868.0, "step": 8755 }, { "epoch": 1.1138531993385066, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.257198333740234, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8638614416122437, "num_tokens": 333957187.0, "step": 8756 }, { "epoch": 1.113980409617097, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.71099090576172, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8727802038192749, "num_tokens": 333999061.0, "step": 8757 }, { "epoch": 1.1141076198956876, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.20085144042969, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8535202741622925, "num_tokens": 334043595.0, "step": 8758 }, { "epoch": 1.1142348301742782, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.43160629272461, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8629405498504639, "num_tokens": 334078487.0, "step": 8759 }, { "epoch": 1.1143620404528687, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.346126556396484, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8668441772460938, "num_tokens": 334119610.0, "step": 8760 }, { "epoch": 1.114489250731459, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.35737991333008, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8535526990890503, "num_tokens": 334160149.0, "step": 8761 }, { "epoch": 1.1146164610100495, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.06953811645508, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8585115075111389, "num_tokens": 334198667.0, "step": 8762 }, { "epoch": 1.11474367128864, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.67313003540039, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8597802519798279, "num_tokens": 334233838.0, "step": 8763 }, { "epoch": 1.1148708815672306, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.867156982421875, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8508856296539307, "num_tokens": 334275703.0, "step": 8764 }, { "epoch": 1.114998091845821, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.74649429321289, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8706692457199097, "num_tokens": 334311119.0, "step": 8765 }, { "epoch": 1.1151253021244116, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.234127044677734, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8516491651535034, "num_tokens": 334348298.0, "step": 8766 }, { "epoch": 1.1152525124030022, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.636016845703125, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8444619178771973, "num_tokens": 334386986.0, "step": 8767 }, { "epoch": 1.1153797226815927, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.83222579956055, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8624545335769653, "num_tokens": 334423263.0, "step": 8768 }, { "epoch": 1.1155069329601832, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.730873107910156, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8744338750839233, "num_tokens": 334460363.0, "step": 8769 }, { "epoch": 1.1156341432387737, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.20237731933594, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8629999756813049, "num_tokens": 334501883.0, "step": 8770 }, { "epoch": 1.1157613535173643, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.50374221801758, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8582397699356079, "num_tokens": 334536242.0, "step": 8771 }, { "epoch": 1.1158885637959548, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.60246658325195, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8717570304870605, "num_tokens": 334569685.0, "step": 8772 }, { "epoch": 1.1160157740745453, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.75962448120117, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8619840145111084, "num_tokens": 334604946.0, "step": 8773 }, { "epoch": 1.1161429843531356, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.20073318481445, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8646470308303833, "num_tokens": 334645495.0, "step": 8774 }, { "epoch": 1.1162701946317262, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 37.60224533081055, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8531873822212219, "num_tokens": 334684239.0, "step": 8775 }, { "epoch": 1.1163974049103167, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 35.89864730834961, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8637955784797668, "num_tokens": 334713799.0, "step": 8776 }, { "epoch": 1.1165246151889072, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.28251266479492, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.879765510559082, "num_tokens": 334753374.0, "step": 8777 }, { "epoch": 1.1166518254674977, "ewc_loss": 0.10986328125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.107589721679688e-05, "grad_norm": 36.111263275146484, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8590855002403259, "num_tokens": 334792867.0, "step": 8778 }, { "epoch": 1.1167790357460883, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.840789794921875, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8553919792175293, "num_tokens": 334832664.0, "step": 8779 }, { "epoch": 1.1169062460246788, "ewc_loss": 0.1103515625, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.202957153320312e-05, "grad_norm": 36.19186019897461, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8512607216835022, "num_tokens": 334876763.0, "step": 8780 }, { "epoch": 1.1170334563032693, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.49053192138672, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8594847321510315, "num_tokens": 334909431.0, "step": 8781 }, { "epoch": 1.1171606665818599, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.049949645996094, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8578251600265503, "num_tokens": 334941939.0, "step": 8782 }, { "epoch": 1.1172878768604504, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.44635772705078, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8685175180435181, "num_tokens": 334981338.0, "step": 8783 }, { "epoch": 1.117415087139041, "ewc_loss": 0.11083984375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.250640869140625e-05, "grad_norm": 36.070213317871094, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8576067090034485, "num_tokens": 335020660.0, "step": 8784 }, { "epoch": 1.1175422974176314, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.32013702392578, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8610582947731018, "num_tokens": 335057377.0, "step": 8785 }, { "epoch": 1.1176695076962218, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.24242401123047, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8476786613464355, "num_tokens": 335092912.0, "step": 8786 }, { "epoch": 1.1177967179748123, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.29066848754883, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8730207085609436, "num_tokens": 335129157.0, "step": 8787 }, { "epoch": 1.1179239282534028, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.00382995605469, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8588494062423706, "num_tokens": 335168065.0, "step": 8788 }, { "epoch": 1.1180511385319933, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.316253662109375, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8581862449645996, "num_tokens": 335218876.0, "step": 8789 }, { "epoch": 1.1181783488105839, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.27824783325195, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.868293821811676, "num_tokens": 335256300.0, "step": 8790 }, { "epoch": 1.1183055590891744, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.25759506225586, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8693839907646179, "num_tokens": 335291035.0, "step": 8791 }, { "epoch": 1.118432769367765, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.277671813964844, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8708077669143677, "num_tokens": 335330092.0, "step": 8792 }, { "epoch": 1.1185599796463555, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.33546447753906, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8455394506454468, "num_tokens": 335367146.0, "step": 8793 }, { "epoch": 1.118687189924946, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.22739791870117, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8577948212623596, "num_tokens": 335405069.0, "step": 8794 }, { "epoch": 1.1188144002035365, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.283226013183594, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8493071794509888, "num_tokens": 335443779.0, "step": 8795 }, { "epoch": 1.118941610482127, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.20363235473633, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.850353479385376, "num_tokens": 335485978.0, "step": 8796 }, { "epoch": 1.1190688207607176, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.025230407714844, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8635311126708984, "num_tokens": 335530375.0, "step": 8797 }, { "epoch": 1.119196031039308, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.398956298828125, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8634485602378845, "num_tokens": 335569109.0, "step": 8798 }, { "epoch": 1.1193232413178984, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.30780792236328, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.843752384185791, "num_tokens": 335599283.0, "step": 8799 }, { "epoch": 1.119450451596489, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.512569427490234, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8663755655288696, "num_tokens": 335634792.0, "step": 8800 }, { "epoch": 1.1195776618750795, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 35.95452117919922, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8724434971809387, "num_tokens": 335672869.0, "step": 8801 }, { "epoch": 1.11970487215367, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.882225036621094, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8646667003631592, "num_tokens": 335714996.0, "step": 8802 }, { "epoch": 1.1198320824322605, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 35.919193267822266, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8513481616973877, "num_tokens": 335756884.0, "step": 8803 }, { "epoch": 1.119959292710851, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.47005081176758, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8685082197189331, "num_tokens": 335793935.0, "step": 8804 }, { "epoch": 1.1200865029894416, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.318565368652344, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8717416524887085, "num_tokens": 335834169.0, "step": 8805 }, { "epoch": 1.120213713268032, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.68759536743164, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8764116168022156, "num_tokens": 335872302.0, "step": 8806 }, { "epoch": 1.1203409235466226, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.2641716003418, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.860439121723175, "num_tokens": 335908074.0, "step": 8807 }, { "epoch": 1.1204681338252132, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.42366027832031, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8701226711273193, "num_tokens": 335946268.0, "step": 8808 }, { "epoch": 1.1205953441038037, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.38353729248047, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8605529069900513, "num_tokens": 335985364.0, "step": 8809 }, { "epoch": 1.120722554382394, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.56309127807617, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8622726202011108, "num_tokens": 336025300.0, "step": 8810 }, { "epoch": 1.1208497646609845, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.381072998046875, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8638650178909302, "num_tokens": 336062105.0, "step": 8811 }, { "epoch": 1.120976974939575, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.30276107788086, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.847684919834137, "num_tokens": 336102944.0, "step": 8812 }, { "epoch": 1.1211041852181656, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.32929611206055, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8866429328918457, "num_tokens": 336142964.0, "step": 8813 }, { "epoch": 1.121231395496756, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.450191497802734, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8623592853546143, "num_tokens": 336186962.0, "step": 8814 }, { "epoch": 1.1213586057753466, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.49568557739258, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8679060935974121, "num_tokens": 336220775.0, "step": 8815 }, { "epoch": 1.1214858160539372, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.267417907714844, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8561974763870239, "num_tokens": 336263067.0, "step": 8816 }, { "epoch": 1.1216130263325277, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.18272399902344, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8632691502571106, "num_tokens": 336308463.0, "step": 8817 }, { "epoch": 1.1217402366111182, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.64883804321289, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.872023344039917, "num_tokens": 336346965.0, "step": 8818 }, { "epoch": 1.1218674468897087, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 35.975589752197266, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8603605031967163, "num_tokens": 336382114.0, "step": 8819 }, { "epoch": 1.1219946571682993, "ewc_loss": 0.11572265625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.93403625488281, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8640958070755005, "num_tokens": 336414020.0, "step": 8820 }, { "epoch": 1.1221218674468898, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.467960357666016, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8574933409690857, "num_tokens": 336452522.0, "step": 8821 }, { "epoch": 1.1222490777254803, "ewc_loss": 0.11572265625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.92836380004883, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8713637590408325, "num_tokens": 336484593.0, "step": 8822 }, { "epoch": 1.1223762880040706, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.14562225341797, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8412064909934998, "num_tokens": 336518964.0, "step": 8823 }, { "epoch": 1.1225034982826612, "ewc_loss": 0.11572265625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.14043426513672, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.847304105758667, "num_tokens": 336556320.0, "step": 8824 }, { "epoch": 1.1226307085612517, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.52922821044922, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.849604070186615, "num_tokens": 336595473.0, "step": 8825 }, { "epoch": 1.1227579188398422, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.83760452270508, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8599294424057007, "num_tokens": 336631793.0, "step": 8826 }, { "epoch": 1.1228851291184327, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.950782775878906, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.843953013420105, "num_tokens": 336671230.0, "step": 8827 }, { "epoch": 1.1230123393970233, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 37.06132507324219, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8603029847145081, "num_tokens": 336712015.0, "step": 8828 }, { "epoch": 1.1231395496756138, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.63145446777344, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8438031077384949, "num_tokens": 336741206.0, "step": 8829 }, { "epoch": 1.1232667599542043, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.68971252441406, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8522796630859375, "num_tokens": 336778838.0, "step": 8830 }, { "epoch": 1.1233939702327949, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.836734771728516, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8659183979034424, "num_tokens": 336815471.0, "step": 8831 }, { "epoch": 1.1235211805113854, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.59434127807617, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8608344793319702, "num_tokens": 336850870.0, "step": 8832 }, { "epoch": 1.123648390789976, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.73168182373047, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8642594218254089, "num_tokens": 336891779.0, "step": 8833 }, { "epoch": 1.1237756010685664, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.37508773803711, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8688690662384033, "num_tokens": 336931171.0, "step": 8834 }, { "epoch": 1.1239028113471567, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.52605056762695, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8707290887832642, "num_tokens": 336963080.0, "step": 8835 }, { "epoch": 1.1240300216257473, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.55922317504883, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8451720476150513, "num_tokens": 337000817.0, "step": 8836 }, { "epoch": 1.1241572319043378, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.76862335205078, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8527736663818359, "num_tokens": 337037249.0, "step": 8837 }, { "epoch": 1.1242844421829283, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.99615478515625, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8668807148933411, "num_tokens": 337077752.0, "step": 8838 }, { "epoch": 1.1244116524615189, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.97197341918945, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8563463687896729, "num_tokens": 337119649.0, "step": 8839 }, { "epoch": 1.1245388627401094, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.500423431396484, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8439633846282959, "num_tokens": 337154376.0, "step": 8840 }, { "epoch": 1.1246660730187, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.78386688232422, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8680662512779236, "num_tokens": 337193377.0, "step": 8841 }, { "epoch": 1.1247932832972904, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.19401168823242, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8483009338378906, "num_tokens": 337233176.0, "step": 8842 }, { "epoch": 1.124920493575881, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.66133117675781, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8442826271057129, "num_tokens": 337262995.0, "step": 8843 }, { "epoch": 1.1250477038544715, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.62234115600586, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8582108616828918, "num_tokens": 337302466.0, "step": 8844 }, { "epoch": 1.125174914133062, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.72868728637695, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8706111907958984, "num_tokens": 337341197.0, "step": 8845 }, { "epoch": 1.1253021244116526, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.78490447998047, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.861823558807373, "num_tokens": 337375602.0, "step": 8846 }, { "epoch": 1.125429334690243, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.28556823730469, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8505592942237854, "num_tokens": 337406685.0, "step": 8847 }, { "epoch": 1.1255565449688334, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.51316833496094, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8546029329299927, "num_tokens": 337447364.0, "step": 8848 }, { "epoch": 1.125683755247424, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.59468078613281, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8597527742385864, "num_tokens": 337487840.0, "step": 8849 }, { "epoch": 1.1258109655260145, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.45408630371094, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8634535074234009, "num_tokens": 337525117.0, "step": 8850 }, { "epoch": 1.125938175804605, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.584747314453125, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8525985479354858, "num_tokens": 337567024.0, "step": 8851 }, { "epoch": 1.1260653860831955, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.345436096191406, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.856861412525177, "num_tokens": 337600117.0, "step": 8852 }, { "epoch": 1.126192596361786, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.98399353027344, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8428745865821838, "num_tokens": 337637188.0, "step": 8853 }, { "epoch": 1.1263198066403766, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.407752990722656, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8549579381942749, "num_tokens": 337676492.0, "step": 8854 }, { "epoch": 1.126447016918967, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.08803939819336, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8727660179138184, "num_tokens": 337713468.0, "step": 8855 }, { "epoch": 1.1265742271975576, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 35.94681167602539, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.846074104309082, "num_tokens": 337750944.0, "step": 8856 }, { "epoch": 1.1267014374761481, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.73649978637695, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8728235960006714, "num_tokens": 337793802.0, "step": 8857 }, { "epoch": 1.1268286477547387, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.24263381958008, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.872592568397522, "num_tokens": 337828550.0, "step": 8858 }, { "epoch": 1.126955858033329, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.813533782958984, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8688064813613892, "num_tokens": 337862168.0, "step": 8859 }, { "epoch": 1.1270830683119195, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.329933166503906, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8560923933982849, "num_tokens": 337898837.0, "step": 8860 }, { "epoch": 1.12721027859051, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.63047790527344, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8446561098098755, "num_tokens": 337939608.0, "step": 8861 }, { "epoch": 1.1273374888691006, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.386314392089844, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8595174551010132, "num_tokens": 337983412.0, "step": 8862 }, { "epoch": 1.127464699147691, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.4984245300293, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8690485954284668, "num_tokens": 338022171.0, "step": 8863 }, { "epoch": 1.1275919094262816, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.29990768432617, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.855654239654541, "num_tokens": 338058838.0, "step": 8864 }, { "epoch": 1.1277191197048722, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.45582580566406, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8540319204330444, "num_tokens": 338095080.0, "step": 8865 }, { "epoch": 1.1278463299834627, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.60646057128906, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8420336246490479, "num_tokens": 338134206.0, "step": 8866 }, { "epoch": 1.1279735402620532, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.38479995727539, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8665511608123779, "num_tokens": 338173794.0, "step": 8867 }, { "epoch": 1.1281007505406437, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.64992141723633, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8559399843215942, "num_tokens": 338206591.0, "step": 8868 }, { "epoch": 1.1282279608192343, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.21406173706055, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8717478513717651, "num_tokens": 338238728.0, "step": 8869 }, { "epoch": 1.1283551710978248, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.67326736450195, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8681620359420776, "num_tokens": 338282372.0, "step": 8870 }, { "epoch": 1.1284823813764153, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.250797271728516, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8538599014282227, "num_tokens": 338319175.0, "step": 8871 }, { "epoch": 1.1286095916550056, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.490875244140625, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8555147647857666, "num_tokens": 338357967.0, "step": 8872 }, { "epoch": 1.1287368019335962, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.67092514038086, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8663971424102783, "num_tokens": 338391431.0, "step": 8873 }, { "epoch": 1.1288640122121867, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.30268478393555, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8684642314910889, "num_tokens": 338426298.0, "step": 8874 }, { "epoch": 1.1289912224907772, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.104068756103516, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8412462472915649, "num_tokens": 338468147.0, "step": 8875 }, { "epoch": 1.1291184327693677, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.66962432861328, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8420137763023376, "num_tokens": 338517095.0, "step": 8876 }, { "epoch": 1.1292456430479583, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.70277404785156, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8651033043861389, "num_tokens": 338549925.0, "step": 8877 }, { "epoch": 1.1293728533265488, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.951229095458984, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8629188537597656, "num_tokens": 338583966.0, "step": 8878 }, { "epoch": 1.1295000636051393, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.88375473022461, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8637182712554932, "num_tokens": 338622715.0, "step": 8879 }, { "epoch": 1.1296272738837299, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.74004364013672, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8451781868934631, "num_tokens": 338659920.0, "step": 8880 }, { "epoch": 1.1297544841623204, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.80728530883789, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8527196645736694, "num_tokens": 338698183.0, "step": 8881 }, { "epoch": 1.129881694440911, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.72068405151367, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8471570014953613, "num_tokens": 338735853.0, "step": 8882 }, { "epoch": 1.1300089047195012, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.54447937011719, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8489380478858948, "num_tokens": 338771714.0, "step": 8883 }, { "epoch": 1.1301361149980917, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.99912643432617, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8772972226142883, "num_tokens": 338813823.0, "step": 8884 }, { "epoch": 1.1302633252766823, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.44826889038086, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8631124496459961, "num_tokens": 338858137.0, "step": 8885 }, { "epoch": 1.1303905355552728, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.80607604980469, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.848120927810669, "num_tokens": 338893653.0, "step": 8886 }, { "epoch": 1.1305177458338633, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.80034637451172, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.840031623840332, "num_tokens": 338931230.0, "step": 8887 }, { "epoch": 1.1306449561124539, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.24732208251953, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8584328889846802, "num_tokens": 338966921.0, "step": 8888 }, { "epoch": 1.1307721663910444, "ewc_loss": 0.11181640625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.34600830078125e-05, "grad_norm": 36.475181579589844, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8401006460189819, "num_tokens": 339000957.0, "step": 8889 }, { "epoch": 1.130899376669635, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.876686096191406, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8512458205223083, "num_tokens": 339035876.0, "step": 8890 }, { "epoch": 1.1310265869482254, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.89788055419922, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8599322438240051, "num_tokens": 339073700.0, "step": 8891 }, { "epoch": 1.131153797226816, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.694969177246094, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8624445796012878, "num_tokens": 339108222.0, "step": 8892 }, { "epoch": 1.1312810075054065, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.719085693359375, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8726561069488525, "num_tokens": 339145749.0, "step": 8893 }, { "epoch": 1.131408217783997, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.68817901611328, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8445104360580444, "num_tokens": 339183852.0, "step": 8894 }, { "epoch": 1.1315354280625876, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.568092346191406, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8738672733306885, "num_tokens": 339218766.0, "step": 8895 }, { "epoch": 1.131662638341178, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.30880355834961, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8607218265533447, "num_tokens": 339259932.0, "step": 8896 }, { "epoch": 1.1317898486197684, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 37.004608154296875, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8635616302490234, "num_tokens": 339302201.0, "step": 8897 }, { "epoch": 1.131917058898359, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.4780158996582, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8292208909988403, "num_tokens": 339342091.0, "step": 8898 }, { "epoch": 1.1320442691769494, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.93123245239258, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8543148636817932, "num_tokens": 339379951.0, "step": 8899 }, { "epoch": 1.13217147945554, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.71405792236328, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.853577733039856, "num_tokens": 339422243.0, "step": 8900 }, { "epoch": 1.1322986897341305, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.5828742980957, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.872746467590332, "num_tokens": 339464969.0, "step": 8901 }, { "epoch": 1.132425900012721, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.779075622558594, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8625029921531677, "num_tokens": 339502382.0, "step": 8902 }, { "epoch": 1.1325531102913116, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.992061614990234, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8435425758361816, "num_tokens": 339542875.0, "step": 8903 }, { "epoch": 1.132680320569902, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 37.252193450927734, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8450713157653809, "num_tokens": 339580776.0, "step": 8904 }, { "epoch": 1.1328075308484926, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.95074462890625, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8724833726882935, "num_tokens": 339620195.0, "step": 8905 }, { "epoch": 1.1329347411270831, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.65622329711914, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8674712181091309, "num_tokens": 339659234.0, "step": 8906 }, { "epoch": 1.1330619514056737, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 37.055633544921875, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8503734469413757, "num_tokens": 339703259.0, "step": 8907 }, { "epoch": 1.133189161684264, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.664913177490234, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8673925399780273, "num_tokens": 339745901.0, "step": 8908 }, { "epoch": 1.1333163719628545, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.582305908203125, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8502242565155029, "num_tokens": 339790417.0, "step": 8909 }, { "epoch": 1.133443582241445, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.787227630615234, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8508089780807495, "num_tokens": 339827224.0, "step": 8910 }, { "epoch": 1.1335707925200356, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.91945266723633, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8521602153778076, "num_tokens": 339859932.0, "step": 8911 }, { "epoch": 1.133698002798626, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.62971115112305, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8490699529647827, "num_tokens": 339902416.0, "step": 8912 }, { "epoch": 1.1338252130772166, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.06297302246094, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8668951392173767, "num_tokens": 339940495.0, "step": 8913 }, { "epoch": 1.1339524233558071, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.55434036254883, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8402500152587891, "num_tokens": 339983010.0, "step": 8914 }, { "epoch": 1.1340796336343977, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 37.088951110839844, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8592998385429382, "num_tokens": 340020683.0, "step": 8915 }, { "epoch": 1.1342068439129882, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.785186767578125, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8505538702011108, "num_tokens": 340063179.0, "step": 8916 }, { "epoch": 1.1343340541915787, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.32111358642578, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.867978572845459, "num_tokens": 340100166.0, "step": 8917 }, { "epoch": 1.1344612644701693, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.804969787597656, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8549705147743225, "num_tokens": 340144325.0, "step": 8918 }, { "epoch": 1.1345884747487598, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 37.16291046142578, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8572481274604797, "num_tokens": 340180563.0, "step": 8919 }, { "epoch": 1.1347156850273503, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.90522766113281, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.845773458480835, "num_tokens": 340224621.0, "step": 8920 }, { "epoch": 1.1348428953059406, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.32371520996094, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8649230003356934, "num_tokens": 340264810.0, "step": 8921 }, { "epoch": 1.1349701055845312, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.1673469543457, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8699406385421753, "num_tokens": 340301158.0, "step": 8922 }, { "epoch": 1.1350973158631217, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.56550216674805, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8588112592697144, "num_tokens": 340339806.0, "step": 8923 }, { "epoch": 1.1352245261417122, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.12788391113281, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8638020753860474, "num_tokens": 340386412.0, "step": 8924 }, { "epoch": 1.1353517364203027, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.517704010009766, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8682103157043457, "num_tokens": 340431391.0, "step": 8925 }, { "epoch": 1.1354789466988933, "ewc_loss": 0.11572265625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.808128356933594, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8631579875946045, "num_tokens": 340465904.0, "step": 8926 }, { "epoch": 1.1356061569774838, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.66442108154297, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8621633052825928, "num_tokens": 340504290.0, "step": 8927 }, { "epoch": 1.1357333672560743, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.32463073730469, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8540951013565063, "num_tokens": 340539100.0, "step": 8928 }, { "epoch": 1.1358605775346649, "ewc_loss": 0.11376953125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.515037536621094, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8724391460418701, "num_tokens": 340569005.0, "step": 8929 }, { "epoch": 1.1359877878132554, "ewc_loss": 0.11572265625, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.22254180908203, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8577643632888794, "num_tokens": 340613476.0, "step": 8930 }, { "epoch": 1.136114998091846, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.847938537597656, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.850090742111206, "num_tokens": 340647982.0, "step": 8931 }, { "epoch": 1.1362422083704362, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.837684631347656, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8364866971969604, "num_tokens": 340683270.0, "step": 8932 }, { "epoch": 1.1363694186490267, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.30617904663086, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8612470030784607, "num_tokens": 340719955.0, "step": 8933 }, { "epoch": 1.1364966289276173, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.71235656738281, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.857798159122467, "num_tokens": 340753934.0, "step": 8934 }, { "epoch": 1.1366238392062078, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.96279525756836, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8655178546905518, "num_tokens": 340787545.0, "step": 8935 }, { "epoch": 1.1367510494847983, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.7505989074707, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8683340549468994, "num_tokens": 340826362.0, "step": 8936 }, { "epoch": 1.1368782597633889, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.876678466796875, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8623758554458618, "num_tokens": 340857871.0, "step": 8937 }, { "epoch": 1.1370054700419794, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.30802917480469, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8655744791030884, "num_tokens": 340897736.0, "step": 8938 }, { "epoch": 1.13713268032057, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.21670150756836, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8543455600738525, "num_tokens": 340935774.0, "step": 8939 }, { "epoch": 1.1372598905991604, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.31702423095703, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8608442544937134, "num_tokens": 340971497.0, "step": 8940 }, { "epoch": 1.137387100877751, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.24749755859375, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8588666915893555, "num_tokens": 341016209.0, "step": 8941 }, { "epoch": 1.1375143111563415, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.368797302246094, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8526325821876526, "num_tokens": 341052137.0, "step": 8942 }, { "epoch": 1.137641521434932, "ewc_loss": 0.1123046875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.393692016601562e-05, "grad_norm": 36.541011810302734, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.862047553062439, "num_tokens": 341083417.0, "step": 8943 }, { "epoch": 1.1377687317135226, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.87675476074219, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8620111346244812, "num_tokens": 341127798.0, "step": 8944 }, { "epoch": 1.137895941992113, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.982177734375, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8450248837471008, "num_tokens": 341173485.0, "step": 8945 }, { "epoch": 1.1380231522707034, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.47715759277344, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.847832441329956, "num_tokens": 341212834.0, "step": 8946 }, { "epoch": 1.138150362549294, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.39055252075195, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8513250946998596, "num_tokens": 341246497.0, "step": 8947 }, { "epoch": 1.1382775728278844, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.088035583496094, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8424847722053528, "num_tokens": 341291498.0, "step": 8948 }, { "epoch": 1.138404783106475, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.1764030456543, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8517102003097534, "num_tokens": 341328104.0, "step": 8949 }, { "epoch": 1.1385319933850655, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.15385055541992, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8700120449066162, "num_tokens": 341359042.0, "step": 8950 }, { "epoch": 1.138659203663656, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.02743911743164, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8579227924346924, "num_tokens": 341403529.0, "step": 8951 }, { "epoch": 1.1387864139422466, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.56147003173828, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8651934862136841, "num_tokens": 341442509.0, "step": 8952 }, { "epoch": 1.138913624220837, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.66227340698242, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.856001079082489, "num_tokens": 341483015.0, "step": 8953 }, { "epoch": 1.1390408344994276, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.653560638427734, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8573381900787354, "num_tokens": 341519676.0, "step": 8954 }, { "epoch": 1.1391680447780181, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.59175109863281, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8585403561592102, "num_tokens": 341563483.0, "step": 8955 }, { "epoch": 1.1392952550566087, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.828948974609375, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8501840233802795, "num_tokens": 341598387.0, "step": 8956 }, { "epoch": 1.139422465335199, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.85464096069336, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8622139692306519, "num_tokens": 341640253.0, "step": 8957 }, { "epoch": 1.1395496756137895, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.06049728393555, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8869446516036987, "num_tokens": 341675788.0, "step": 8958 }, { "epoch": 1.13967688589238, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.223026275634766, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8593258857727051, "num_tokens": 341716917.0, "step": 8959 }, { "epoch": 1.1398040961709706, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.5171012878418, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8584446907043457, "num_tokens": 341758936.0, "step": 8960 }, { "epoch": 1.139931306449561, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.12063217163086, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8344798684120178, "num_tokens": 341797612.0, "step": 8961 }, { "epoch": 1.1400585167281516, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.64456558227539, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.853875994682312, "num_tokens": 341836061.0, "step": 8962 }, { "epoch": 1.1401857270067421, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.062278747558594, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8562333583831787, "num_tokens": 341875465.0, "step": 8963 }, { "epoch": 1.1403129372853327, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.754608154296875, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8535619974136353, "num_tokens": 341919795.0, "step": 8964 }, { "epoch": 1.1404401475639232, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.916202545166016, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8552135229110718, "num_tokens": 341953291.0, "step": 8965 }, { "epoch": 1.1405673578425137, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 37.16501998901367, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8774861693382263, "num_tokens": 341988364.0, "step": 8966 }, { "epoch": 1.1406945681211043, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.69520568847656, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8623900413513184, "num_tokens": 342028478.0, "step": 8967 }, { "epoch": 1.1408217783996948, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.42052459716797, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8661136627197266, "num_tokens": 342067348.0, "step": 8968 }, { "epoch": 1.1409489886782853, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.41935729980469, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8544376492500305, "num_tokens": 342105845.0, "step": 8969 }, { "epoch": 1.1410761989568756, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.520668029785156, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8585704565048218, "num_tokens": 342142072.0, "step": 8970 }, { "epoch": 1.1412034092354661, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.55949020385742, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8609926700592041, "num_tokens": 342176592.0, "step": 8971 }, { "epoch": 1.1413306195140567, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.52585220336914, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8625969886779785, "num_tokens": 342211896.0, "step": 8972 }, { "epoch": 1.1414578297926472, "ewc_loss": 0.11279296875, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 9.441375732421875e-05, "grad_norm": 36.58584213256836, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8632369041442871, "num_tokens": 342256078.0, "step": 8973 }, { "epoch": 1.1415850400712377, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 37.02222442626953, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.861860454082489, "num_tokens": 342296766.0, "step": 8974 }, { "epoch": 1.1417122503498283, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 37.0693244934082, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8572944402694702, "num_tokens": 342333689.0, "step": 8975 }, { "epoch": 1.1418394606284188, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.822811126708984, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8699533343315125, "num_tokens": 342369605.0, "step": 8976 }, { "epoch": 1.1419666709070093, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 37.231773376464844, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8673340678215027, "num_tokens": 342410806.0, "step": 8977 }, { "epoch": 1.1420938811855998, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.92353820800781, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8847538232803345, "num_tokens": 342446957.0, "step": 8978 }, { "epoch": 1.1422210914641904, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.05903244018555, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8457366228103638, "num_tokens": 342480717.0, "step": 8979 }, { "epoch": 1.142348301742781, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.97660827636719, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8667050004005432, "num_tokens": 342520431.0, "step": 8980 }, { "epoch": 1.1424755120213712, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.89420700073242, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8542598485946655, "num_tokens": 342561735.0, "step": 8981 }, { "epoch": 1.1426027222999617, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.938758850097656, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8709069490432739, "num_tokens": 342593899.0, "step": 8982 }, { "epoch": 1.1427299325785523, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.13637924194336, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8584084510803223, "num_tokens": 342628044.0, "step": 8983 }, { "epoch": 1.1428571428571428, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.49152374267578, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8570607304573059, "num_tokens": 342665018.0, "step": 8984 }, { "epoch": 1.1429843531357333, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.877777099609375, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8471580743789673, "num_tokens": 342704832.0, "step": 8985 }, { "epoch": 1.1431115634143239, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.24811935424805, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8667252659797668, "num_tokens": 342741598.0, "step": 8986 }, { "epoch": 1.1432387736929144, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 37.14717483520508, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8557718992233276, "num_tokens": 342777302.0, "step": 8987 }, { "epoch": 1.143365983971505, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.92021179199219, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.838110089302063, "num_tokens": 342809578.0, "step": 8988 }, { "epoch": 1.1434931942500954, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.33049392700195, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8596359491348267, "num_tokens": 342845809.0, "step": 8989 }, { "epoch": 1.143620404528686, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.64079666137695, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8440597653388977, "num_tokens": 342886118.0, "step": 8990 }, { "epoch": 1.1437476148072765, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.33893966674805, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8522301912307739, "num_tokens": 342922517.0, "step": 8991 }, { "epoch": 1.143874825085867, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 37.03453826904297, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8528597354888916, "num_tokens": 342959741.0, "step": 8992 }, { "epoch": 1.1440020353644575, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.868289947509766, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8711851239204407, "num_tokens": 342991418.0, "step": 8993 }, { "epoch": 1.144129245643048, "ewc_loss": 0.11572265625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.33308792114258, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.87671959400177, "num_tokens": 343024263.0, "step": 8994 }, { "epoch": 1.1442564559216384, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 36.68675994873047, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.858755886554718, "num_tokens": 343066388.0, "step": 8995 }, { "epoch": 1.144383666200229, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.57703399658203, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8605056405067444, "num_tokens": 343107631.0, "step": 8996 }, { "epoch": 1.1445108764788194, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.7429084777832, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8643243312835693, "num_tokens": 343143645.0, "step": 8997 }, { "epoch": 1.14463808675741, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.481346130371094, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8454280495643616, "num_tokens": 343182076.0, "step": 8998 }, { "epoch": 1.1447652970360005, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.51883316040039, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8780644536018372, "num_tokens": 343222808.0, "step": 8999 }, { "epoch": 1.144892507314591, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.539642333984375, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.85501629114151, "num_tokens": 343268229.0, "step": 9000 }, { "epoch": 1.1450197175931816, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.68094253540039, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.849983811378479, "num_tokens": 343304459.0, "step": 9001 }, { "epoch": 1.145146927871772, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.40779495239258, "learning_rate": 1e-06, "loss": 0.6664, "mean_token_accuracy": 0.8198075890541077, "num_tokens": 343343211.0, "step": 9002 }, { "epoch": 1.1452741381503626, "ewc_loss": 0.11328125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.489059448242188e-05, "grad_norm": 36.72385025024414, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8552558422088623, "num_tokens": 343388498.0, "step": 9003 }, { "epoch": 1.1454013484289531, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.30413055419922, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8565922379493713, "num_tokens": 343430101.0, "step": 9004 }, { "epoch": 1.1455285587075437, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.19976043701172, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8465821743011475, "num_tokens": 343466454.0, "step": 9005 }, { "epoch": 1.145655768986134, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.95803451538086, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8752545118331909, "num_tokens": 343506960.0, "step": 9006 }, { "epoch": 1.1457829792647245, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.46531677246094, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.86739182472229, "num_tokens": 343541133.0, "step": 9007 }, { "epoch": 1.145910189543315, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.7176628112793, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8672605752944946, "num_tokens": 343580266.0, "step": 9008 }, { "epoch": 1.1460373998219056, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.97406005859375, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8578760623931885, "num_tokens": 343611691.0, "step": 9009 }, { "epoch": 1.146164610100496, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.637298583984375, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8654000759124756, "num_tokens": 343649355.0, "step": 9010 }, { "epoch": 1.1462918203790866, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.759613037109375, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8720898628234863, "num_tokens": 343681606.0, "step": 9011 }, { "epoch": 1.1464190306576771, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.95726776123047, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8691267371177673, "num_tokens": 343721592.0, "step": 9012 }, { "epoch": 1.1465462409362677, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.810516357421875, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8555134534835815, "num_tokens": 343764084.0, "step": 9013 }, { "epoch": 1.1466734512148582, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.96991729736328, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.8611498475074768, "num_tokens": 343801213.0, "step": 9014 }, { "epoch": 1.1468006614934487, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.96257019042969, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8593713045120239, "num_tokens": 343839761.0, "step": 9015 }, { "epoch": 1.1469278717720393, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.29507064819336, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8600916862487793, "num_tokens": 343877792.0, "step": 9016 }, { "epoch": 1.1470550820506298, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 37.027427673339844, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8636186122894287, "num_tokens": 343915107.0, "step": 9017 }, { "epoch": 1.1471822923292203, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.39984130859375, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8628230094909668, "num_tokens": 343949783.0, "step": 9018 }, { "epoch": 1.1473095026078106, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.971946716308594, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8392877578735352, "num_tokens": 343991058.0, "step": 9019 }, { "epoch": 1.1474367128864011, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.275909423828125, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8595402240753174, "num_tokens": 344030424.0, "step": 9020 }, { "epoch": 1.1475639231649917, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.803348541259766, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8603335618972778, "num_tokens": 344071385.0, "step": 9021 }, { "epoch": 1.1476911334435822, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.26510238647461, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.86765456199646, "num_tokens": 344109725.0, "step": 9022 }, { "epoch": 1.1478183437221727, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.963558197021484, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8652904033660889, "num_tokens": 344143947.0, "step": 9023 }, { "epoch": 1.1479455540007633, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.358154296875, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8483209609985352, "num_tokens": 344179969.0, "step": 9024 }, { "epoch": 1.1480727642793538, "ewc_loss": 0.1142578125, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.5367431640625e-05, "grad_norm": 37.12242126464844, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8690874576568604, "num_tokens": 344212279.0, "step": 9025 }, { "epoch": 1.1481999745579443, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.57184600830078, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8648641109466553, "num_tokens": 344248402.0, "step": 9026 }, { "epoch": 1.1483271848365348, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.56229019165039, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8551221489906311, "num_tokens": 344286557.0, "step": 9027 }, { "epoch": 1.1484543951151254, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.40418243408203, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8596833944320679, "num_tokens": 344331014.0, "step": 9028 }, { "epoch": 1.148581605393716, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.481910705566406, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8551127910614014, "num_tokens": 344373239.0, "step": 9029 }, { "epoch": 1.1487088156723062, "ewc_loss": 0.11474609375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.584426879882812e-05, "grad_norm": 36.56542205810547, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8731038570404053, "num_tokens": 344409871.0, "step": 9030 }, { "epoch": 1.1488360259508967, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.23229217529297, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8498573303222656, "num_tokens": 344452108.0, "step": 9031 }, { "epoch": 1.1489632362294873, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.22481918334961, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.854303777217865, "num_tokens": 344489798.0, "step": 9032 }, { "epoch": 1.1490904465080778, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.87125015258789, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8664456009864807, "num_tokens": 344528194.0, "step": 9033 }, { "epoch": 1.1492176567866683, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.121803283691406, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8445911407470703, "num_tokens": 344559699.0, "step": 9034 }, { "epoch": 1.1493448670652588, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.635860443115234, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8516979217529297, "num_tokens": 344598234.0, "step": 9035 }, { "epoch": 1.1494720773438494, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.201995849609375, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8518359661102295, "num_tokens": 344634780.0, "step": 9036 }, { "epoch": 1.14959928762244, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.69979476928711, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8630499243736267, "num_tokens": 344677167.0, "step": 9037 }, { "epoch": 1.1497264979010304, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.2912712097168, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8450208902359009, "num_tokens": 344713824.0, "step": 9038 }, { "epoch": 1.149853708179621, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.58563232421875, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.845927357673645, "num_tokens": 344755814.0, "step": 9039 }, { "epoch": 1.1499809184582115, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.10205841064453, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8734248876571655, "num_tokens": 344795022.0, "step": 9040 }, { "epoch": 1.150108128736802, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.81346130371094, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8688404560089111, "num_tokens": 344833906.0, "step": 9041 }, { "epoch": 1.1502353390153925, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.6384391784668, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8644314408302307, "num_tokens": 344875975.0, "step": 9042 }, { "epoch": 1.150362549293983, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.175010681152344, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8626418709754944, "num_tokens": 344915433.0, "step": 9043 }, { "epoch": 1.1504897595725734, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.866390228271484, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8631730079650879, "num_tokens": 344949038.0, "step": 9044 }, { "epoch": 1.150616969851164, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.97334289550781, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8603920936584473, "num_tokens": 344988260.0, "step": 9045 }, { "epoch": 1.1507441801297544, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.93403244018555, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8714193105697632, "num_tokens": 345024355.0, "step": 9046 }, { "epoch": 1.150871390408345, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.99094009399414, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8678968548774719, "num_tokens": 345063041.0, "step": 9047 }, { "epoch": 1.1509986006869355, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.171810150146484, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8783696889877319, "num_tokens": 345097813.0, "step": 9048 }, { "epoch": 1.151125810965526, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.37686538696289, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8552786111831665, "num_tokens": 345134534.0, "step": 9049 }, { "epoch": 1.1512530212441165, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.76182556152344, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8630671501159668, "num_tokens": 345171018.0, "step": 9050 }, { "epoch": 1.151380231522707, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.03059005737305, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8711386322975159, "num_tokens": 345208006.0, "step": 9051 }, { "epoch": 1.1515074418012976, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.8339958190918, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.875977098941803, "num_tokens": 345246616.0, "step": 9052 }, { "epoch": 1.1516346520798881, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.074745178222656, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8748608231544495, "num_tokens": 345288095.0, "step": 9053 }, { "epoch": 1.1517618623584787, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.71433639526367, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8470216989517212, "num_tokens": 345333291.0, "step": 9054 }, { "epoch": 1.151889072637069, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.32712936401367, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8591656684875488, "num_tokens": 345375093.0, "step": 9055 }, { "epoch": 1.1520162829156595, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.6010627746582, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8530744314193726, "num_tokens": 345415547.0, "step": 9056 }, { "epoch": 1.15214349319425, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.41817092895508, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8630533218383789, "num_tokens": 345457340.0, "step": 9057 }, { "epoch": 1.1522707034728406, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.905120849609375, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8561586737632751, "num_tokens": 345494711.0, "step": 9058 }, { "epoch": 1.152397913751431, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.12118911743164, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8515254259109497, "num_tokens": 345534249.0, "step": 9059 }, { "epoch": 1.1525251240300216, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.57469177246094, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8594075441360474, "num_tokens": 345570832.0, "step": 9060 }, { "epoch": 1.1526523343086121, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.97612380981445, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8550939559936523, "num_tokens": 345609007.0, "step": 9061 }, { "epoch": 1.1527795445872027, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.4411506652832, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8718874454498291, "num_tokens": 345647979.0, "step": 9062 }, { "epoch": 1.1529067548657932, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.632110595703125e-05, "grad_norm": 36.99193572998047, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8506098985671997, "num_tokens": 345681821.0, "step": 9063 }, { "epoch": 1.1530339651443837, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.35956954956055, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8479602336883545, "num_tokens": 345724274.0, "step": 9064 }, { "epoch": 1.1531611754229742, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.47895431518555, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8768315315246582, "num_tokens": 345763558.0, "step": 9065 }, { "epoch": 1.1532883857015648, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.55012130737305, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8532630801200867, "num_tokens": 345798486.0, "step": 9066 }, { "epoch": 1.1534155959801553, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 36.45974349975586, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8556034564971924, "num_tokens": 345839379.0, "step": 9067 }, { "epoch": 1.1535428062587456, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.387630462646484, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8549972176551819, "num_tokens": 345876675.0, "step": 9068 }, { "epoch": 1.1536700165373361, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.89799118041992, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.879106342792511, "num_tokens": 345913482.0, "step": 9069 }, { "epoch": 1.1537972268159267, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.73473358154297, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.855290412902832, "num_tokens": 345947671.0, "step": 9070 }, { "epoch": 1.1539244370945172, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.11011505126953, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8623000383377075, "num_tokens": 345981520.0, "step": 9071 }, { "epoch": 1.1540516473731077, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.001380920410156, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8412019610404968, "num_tokens": 346020050.0, "step": 9072 }, { "epoch": 1.1541788576516983, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.94525909423828, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8506330251693726, "num_tokens": 346050000.0, "step": 9073 }, { "epoch": 1.1543060679302888, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.00651168823242, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8532203435897827, "num_tokens": 346085038.0, "step": 9074 }, { "epoch": 1.1544332782088793, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.83934783935547, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8697666525840759, "num_tokens": 346125871.0, "step": 9075 }, { "epoch": 1.1545604884874698, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.29331970214844, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8676877021789551, "num_tokens": 346160460.0, "step": 9076 }, { "epoch": 1.1546876987660604, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.571937561035156, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8485754728317261, "num_tokens": 346201621.0, "step": 9077 }, { "epoch": 1.154814909044651, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.39900588989258, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8623517751693726, "num_tokens": 346242680.0, "step": 9078 }, { "epoch": 1.1549421193232412, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.64720916748047, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8506594896316528, "num_tokens": 346275774.0, "step": 9079 }, { "epoch": 1.1550693296018317, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.13450241088867, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8605054616928101, "num_tokens": 346317648.0, "step": 9080 }, { "epoch": 1.1551965398804223, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.10506057739258, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8624974489212036, "num_tokens": 346357736.0, "step": 9081 }, { "epoch": 1.1553237501590128, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.88801574707031, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.857353925704956, "num_tokens": 346397961.0, "step": 9082 }, { "epoch": 1.1554509604376033, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.05023956298828, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8712998032569885, "num_tokens": 346438268.0, "step": 9083 }, { "epoch": 1.1555781707161938, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.83970642089844, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8493002653121948, "num_tokens": 346479024.0, "step": 9084 }, { "epoch": 1.1557053809947844, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.14313888549805, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8657515048980713, "num_tokens": 346523526.0, "step": 9085 }, { "epoch": 1.155832591273375, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.74074935913086, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.869354248046875, "num_tokens": 346561985.0, "step": 9086 }, { "epoch": 1.1559598015519654, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.25173568725586, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8470234274864197, "num_tokens": 346605600.0, "step": 9087 }, { "epoch": 1.156087011830556, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.8851432800293, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8724702596664429, "num_tokens": 346640798.0, "step": 9088 }, { "epoch": 1.1562142221091465, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.643802642822266, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8547803163528442, "num_tokens": 346678483.0, "step": 9089 }, { "epoch": 1.156341432387737, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.194236755371094, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8755520582199097, "num_tokens": 346714604.0, "step": 9090 }, { "epoch": 1.1564686426663275, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.739013671875, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.87009197473526, "num_tokens": 346747203.0, "step": 9091 }, { "epoch": 1.156595852944918, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.0737419128418, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.85737144947052, "num_tokens": 346790650.0, "step": 9092 }, { "epoch": 1.1567230632235084, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.73640441894531, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8813830614089966, "num_tokens": 346822872.0, "step": 9093 }, { "epoch": 1.156850273502099, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.74055099487305, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8666135668754578, "num_tokens": 346857999.0, "step": 9094 }, { "epoch": 1.1569774837806894, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.30204772949219, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8659288883209229, "num_tokens": 346891804.0, "step": 9095 }, { "epoch": 1.15710469405928, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.71664047241211, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8581466674804688, "num_tokens": 346931554.0, "step": 9096 }, { "epoch": 1.1572319043378705, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.07857894897461, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8471295237541199, "num_tokens": 346965214.0, "step": 9097 }, { "epoch": 1.157359114616461, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.68655014038086, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8721210956573486, "num_tokens": 347001219.0, "step": 9098 }, { "epoch": 1.1574863248950515, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 36.84002685546875, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8415132761001587, "num_tokens": 347043212.0, "step": 9099 }, { "epoch": 1.157613535173642, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.883419036865234, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8718612790107727, "num_tokens": 347082249.0, "step": 9100 }, { "epoch": 1.1577407454522326, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.72157669067383, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8569634556770325, "num_tokens": 347119286.0, "step": 9101 }, { "epoch": 1.1578679557308231, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.800689697265625, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.873882532119751, "num_tokens": 347155549.0, "step": 9102 }, { "epoch": 1.1579951660094137, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.03592300415039, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8613360524177551, "num_tokens": 347196846.0, "step": 9103 }, { "epoch": 1.158122376288004, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.31041717529297, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8742990493774414, "num_tokens": 347234028.0, "step": 9104 }, { "epoch": 1.1582495865665945, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.41301345825195, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8705575466156006, "num_tokens": 347276931.0, "step": 9105 }, { "epoch": 1.158376796845185, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.69034194946289, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8555740118026733, "num_tokens": 347317139.0, "step": 9106 }, { "epoch": 1.1585040071237755, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.84290313720703, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8623631596565247, "num_tokens": 347355718.0, "step": 9107 }, { "epoch": 1.158631217402366, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.841064453125, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8492653965950012, "num_tokens": 347393403.0, "step": 9108 }, { "epoch": 1.1587584276809566, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.900672912597656, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8671278953552246, "num_tokens": 347433736.0, "step": 9109 }, { "epoch": 1.1588856379595471, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.9835205078125, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8628553152084351, "num_tokens": 347474574.0, "step": 9110 }, { "epoch": 1.1590128482381377, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.8787956237793, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8666268587112427, "num_tokens": 347518785.0, "step": 9111 }, { "epoch": 1.1591400585167282, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.322635650634766, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.84690260887146, "num_tokens": 347557518.0, "step": 9112 }, { "epoch": 1.1592672687953187, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.82875442504883, "learning_rate": 1e-06, "loss": 0.6219, "mean_token_accuracy": 0.8307647705078125, "num_tokens": 347600094.0, "step": 9113 }, { "epoch": 1.1593944790739092, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.488868713378906, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8585779666900635, "num_tokens": 347633709.0, "step": 9114 }, { "epoch": 1.1595216893524998, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.79247283935547, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8679351210594177, "num_tokens": 347670027.0, "step": 9115 }, { "epoch": 1.1596488996310903, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.886451721191406, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8653149604797363, "num_tokens": 347707488.0, "step": 9116 }, { "epoch": 1.1597761099096806, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.147335052490234, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8551397919654846, "num_tokens": 347746030.0, "step": 9117 }, { "epoch": 1.1599033201882711, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.65218734741211, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8530385494232178, "num_tokens": 347783409.0, "step": 9118 }, { "epoch": 1.1600305304668617, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.51970672607422, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8673319816589355, "num_tokens": 347821335.0, "step": 9119 }, { "epoch": 1.1601577407454522, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.76753234863281, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8691345453262329, "num_tokens": 347857590.0, "step": 9120 }, { "epoch": 1.1602849510240427, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.51023483276367, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8718166351318359, "num_tokens": 347897000.0, "step": 9121 }, { "epoch": 1.1604121613026332, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.690311431884766, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8590166568756104, "num_tokens": 347928947.0, "step": 9122 }, { "epoch": 1.1605393715812238, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.27909851074219, "learning_rate": 1e-06, "loss": 0.6535, "mean_token_accuracy": 0.8288458585739136, "num_tokens": 347959294.0, "step": 9123 }, { "epoch": 1.1606665818598143, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.83264923095703, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8663384914398193, "num_tokens": 347993103.0, "step": 9124 }, { "epoch": 1.1607937921384048, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.181819915771484, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8739807605743408, "num_tokens": 348028412.0, "step": 9125 }, { "epoch": 1.1609210024169954, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.7719841003418, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8429482579231262, "num_tokens": 348071429.0, "step": 9126 }, { "epoch": 1.161048212695586, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.668453216552734, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8656575679779053, "num_tokens": 348113222.0, "step": 9127 }, { "epoch": 1.1611754229741762, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.611549377441406, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.8593989014625549, "num_tokens": 348150188.0, "step": 9128 }, { "epoch": 1.1613026332527667, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.78678894042969, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8624945878982544, "num_tokens": 348188547.0, "step": 9129 }, { "epoch": 1.1614298435313573, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.68842315673828, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8557583093643188, "num_tokens": 348228624.0, "step": 9130 }, { "epoch": 1.1615570538099478, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.726139068603516, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8567416667938232, "num_tokens": 348264932.0, "step": 9131 }, { "epoch": 1.1616842640885383, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.72815704345703, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8619334697723389, "num_tokens": 348307680.0, "step": 9132 }, { "epoch": 1.1618114743671288, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.54075241088867, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8752211928367615, "num_tokens": 348346449.0, "step": 9133 }, { "epoch": 1.1619386846457194, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.86459732055664, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.871131956577301, "num_tokens": 348386393.0, "step": 9134 }, { "epoch": 1.16206589492431, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.47804641723633, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8553020358085632, "num_tokens": 348429458.0, "step": 9135 }, { "epoch": 1.1621931052029004, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.961490631103516, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8474140167236328, "num_tokens": 348471365.0, "step": 9136 }, { "epoch": 1.162320315481491, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.48282241821289, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8446118235588074, "num_tokens": 348513425.0, "step": 9137 }, { "epoch": 1.1624475257600815, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.332420349121094, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.85854572057724, "num_tokens": 348551181.0, "step": 9138 }, { "epoch": 1.162574736038672, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.372684478759766, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8812758326530457, "num_tokens": 348583611.0, "step": 9139 }, { "epoch": 1.1627019463172625, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.34965896606445, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8582755923271179, "num_tokens": 348624281.0, "step": 9140 }, { "epoch": 1.162829156595853, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.681068420410156, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8579366207122803, "num_tokens": 348661416.0, "step": 9141 }, { "epoch": 1.1629563668744434, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.6466178894043, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8396753668785095, "num_tokens": 348703907.0, "step": 9142 }, { "epoch": 1.163083577153034, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.32955551147461, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8717799186706543, "num_tokens": 348735617.0, "step": 9143 }, { "epoch": 1.1632107874316244, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.75575256347656, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8579045534133911, "num_tokens": 348773552.0, "step": 9144 }, { "epoch": 1.163337997710215, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.01769256591797, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8612690567970276, "num_tokens": 348811274.0, "step": 9145 }, { "epoch": 1.1634652079888055, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.716339111328125, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8487101793289185, "num_tokens": 348847773.0, "step": 9146 }, { "epoch": 1.163592418267396, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.28532409667969, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8506667613983154, "num_tokens": 348886356.0, "step": 9147 }, { "epoch": 1.1637196285459865, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.47344207763672, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8709050416946411, "num_tokens": 348922892.0, "step": 9148 }, { "epoch": 1.163846838824577, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.67826461791992, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8782214522361755, "num_tokens": 348954924.0, "step": 9149 }, { "epoch": 1.1639740491031676, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 36.7925910949707, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8645014762878418, "num_tokens": 348997335.0, "step": 9150 }, { "epoch": 1.1641012593817581, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.28887939453125, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8492973446846008, "num_tokens": 349038221.0, "step": 9151 }, { "epoch": 1.1642284696603487, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.3337516784668, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8550388813018799, "num_tokens": 349076559.0, "step": 9152 }, { "epoch": 1.164355679938939, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.545536041259766, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8383303880691528, "num_tokens": 349112264.0, "step": 9153 }, { "epoch": 1.1644828902175295, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.276004791259766, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8711935877799988, "num_tokens": 349146571.0, "step": 9154 }, { "epoch": 1.16461010049612, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.7681884765625, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8667322397232056, "num_tokens": 349183674.0, "step": 9155 }, { "epoch": 1.1647373107747105, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.3303108215332, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8643544912338257, "num_tokens": 349227042.0, "step": 9156 }, { "epoch": 1.164864521053301, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.90445327758789, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8564320206642151, "num_tokens": 349264072.0, "step": 9157 }, { "epoch": 1.1649917313318916, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.34083557128906, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8605871796607971, "num_tokens": 349304427.0, "step": 9158 }, { "epoch": 1.1651189416104821, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.62715148925781, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8586496114730835, "num_tokens": 349346708.0, "step": 9159 }, { "epoch": 1.1652461518890727, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.34174346923828, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8587726354598999, "num_tokens": 349387716.0, "step": 9160 }, { "epoch": 1.1653733621676632, "ewc_loss": 0.115234375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.679794311523438e-05, "grad_norm": 37.1529541015625, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8574944138526917, "num_tokens": 349425757.0, "step": 9161 }, { "epoch": 1.1655005724462537, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.72172546386719, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8677644729614258, "num_tokens": 349472097.0, "step": 9162 }, { "epoch": 1.1656277827248442, "ewc_loss": 0.1162109375, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 9.72747802734375e-05, "grad_norm": 37.00514221191406, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8564984798431396, "num_tokens": 349513809.0, "step": 9163 }, { "epoch": 1.1657549930034348, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.20030975341797, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8738512992858887, "num_tokens": 349544906.0, "step": 9164 }, { "epoch": 1.1658822032820253, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.81308364868164, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8547033667564392, "num_tokens": 349582529.0, "step": 9165 }, { "epoch": 1.1660094135606156, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.67569351196289, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.855644941329956, "num_tokens": 349624871.0, "step": 9166 }, { "epoch": 1.1661366238392061, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.99422073364258, "learning_rate": 1e-06, "loss": 0.6363, "mean_token_accuracy": 0.8238975405693054, "num_tokens": 349660783.0, "step": 9167 }, { "epoch": 1.1662638341177967, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.58720016479492, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8703609704971313, "num_tokens": 349698430.0, "step": 9168 }, { "epoch": 1.1663910443963872, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.0710563659668, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8524765372276306, "num_tokens": 349734981.0, "step": 9169 }, { "epoch": 1.1665182546749777, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.2049446105957, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8540731072425842, "num_tokens": 349774267.0, "step": 9170 }, { "epoch": 1.1666454649535682, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.217227935791016, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8570478558540344, "num_tokens": 349811400.0, "step": 9171 }, { "epoch": 1.1667726752321588, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.65080261230469, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8507548570632935, "num_tokens": 349850031.0, "step": 9172 }, { "epoch": 1.1668998855107493, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.782405853271484, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8625087141990662, "num_tokens": 349889967.0, "step": 9173 }, { "epoch": 1.1670270957893398, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.78636932373047, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8546873331069946, "num_tokens": 349930276.0, "step": 9174 }, { "epoch": 1.1671543060679304, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.904972076416016, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8552690744400024, "num_tokens": 349969889.0, "step": 9175 }, { "epoch": 1.1672815163465209, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.596778869628906, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8585953116416931, "num_tokens": 350005729.0, "step": 9176 }, { "epoch": 1.1674087266251112, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.17136001586914, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8668177127838135, "num_tokens": 350043130.0, "step": 9177 }, { "epoch": 1.1675359369037017, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.07634353637695, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8784918785095215, "num_tokens": 350081233.0, "step": 9178 }, { "epoch": 1.1676631471822922, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.36571502685547, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8459742069244385, "num_tokens": 350119403.0, "step": 9179 }, { "epoch": 1.1677903574608828, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.067054748535156, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8480933904647827, "num_tokens": 350159099.0, "step": 9180 }, { "epoch": 1.1679175677394733, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.33926010131836, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.859918475151062, "num_tokens": 350195642.0, "step": 9181 }, { "epoch": 1.1680447780180638, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 36.91764831542969, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8678003549575806, "num_tokens": 350236721.0, "step": 9182 }, { "epoch": 1.1681719882966544, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.560157775878906, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8534212708473206, "num_tokens": 350272691.0, "step": 9183 }, { "epoch": 1.168299198575245, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.96295166015625, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8578716516494751, "num_tokens": 350318299.0, "step": 9184 }, { "epoch": 1.1684264088538354, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.22330856323242, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.859282374382019, "num_tokens": 350360973.0, "step": 9185 }, { "epoch": 1.168553619132426, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.22441864013672, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8670793771743774, "num_tokens": 350400516.0, "step": 9186 }, { "epoch": 1.1686808294110165, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.472007751464844, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.85611891746521, "num_tokens": 350438147.0, "step": 9187 }, { "epoch": 1.168808039689607, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.969215393066406, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8644412755966187, "num_tokens": 350481604.0, "step": 9188 }, { "epoch": 1.1689352499681975, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.44108963012695, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8807234764099121, "num_tokens": 350518225.0, "step": 9189 }, { "epoch": 1.169062460246788, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.11061477661133, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.860005259513855, "num_tokens": 350553752.0, "step": 9190 }, { "epoch": 1.1691896705253784, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.185150146484375, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8659725189208984, "num_tokens": 350593676.0, "step": 9191 }, { "epoch": 1.169316880803969, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.294822692871094, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8573977947235107, "num_tokens": 350633618.0, "step": 9192 }, { "epoch": 1.1694440910825594, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.160301208496094, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8578873872756958, "num_tokens": 350678221.0, "step": 9193 }, { "epoch": 1.16957130136115, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.72605514526367, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.87026047706604, "num_tokens": 350718258.0, "step": 9194 }, { "epoch": 1.1696985116397405, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.577056884765625, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.869473934173584, "num_tokens": 350757370.0, "step": 9195 }, { "epoch": 1.169825721918331, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.706111907958984, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.861777126789093, "num_tokens": 350801394.0, "step": 9196 }, { "epoch": 1.1699529321969215, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.192203521728516, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8511512279510498, "num_tokens": 350840635.0, "step": 9197 }, { "epoch": 1.170080142475512, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 38.265567779541016, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.875257134437561, "num_tokens": 350879616.0, "step": 9198 }, { "epoch": 1.1702073527541026, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.29547882080078, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8601429462432861, "num_tokens": 350921710.0, "step": 9199 }, { "epoch": 1.1703345630326931, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.97013854980469, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8483863472938538, "num_tokens": 350964218.0, "step": 9200 }, { "epoch": 1.1704617733112836, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 37.13101577758789, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8697909116744995, "num_tokens": 351004511.0, "step": 9201 }, { "epoch": 1.170588983589874, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 38.1832389831543, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8544871211051941, "num_tokens": 351044583.0, "step": 9202 }, { "epoch": 1.1707161938684645, "ewc_loss": 0.11669921875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.775161743164062e-05, "grad_norm": 36.74300003051758, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8440475463867188, "num_tokens": 351083613.0, "step": 9203 }, { "epoch": 1.170843404147055, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.68429946899414, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8523952960968018, "num_tokens": 351117412.0, "step": 9204 }, { "epoch": 1.1709706144256455, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.282405853271484, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8450002670288086, "num_tokens": 351154779.0, "step": 9205 }, { "epoch": 1.171097824704236, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.41181182861328, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8699795007705688, "num_tokens": 351191412.0, "step": 9206 }, { "epoch": 1.1712250349828266, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.51771926879883, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8638898134231567, "num_tokens": 351235465.0, "step": 9207 }, { "epoch": 1.1713522452614171, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.400638580322266, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8651201725006104, "num_tokens": 351279326.0, "step": 9208 }, { "epoch": 1.1714794555400077, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.42469787597656, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.877561628818512, "num_tokens": 351316678.0, "step": 9209 }, { "epoch": 1.1716066658185982, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.29218292236328, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8558376431465149, "num_tokens": 351352641.0, "step": 9210 }, { "epoch": 1.1717338760971887, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.56896209716797, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8501130938529968, "num_tokens": 351385845.0, "step": 9211 }, { "epoch": 1.1718610863757792, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.21142578125, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8671911358833313, "num_tokens": 351425220.0, "step": 9212 }, { "epoch": 1.1719882966543698, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.48981857299805, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8555806279182434, "num_tokens": 351467515.0, "step": 9213 }, { "epoch": 1.1721155069329603, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.350982666015625, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8572177290916443, "num_tokens": 351507571.0, "step": 9214 }, { "epoch": 1.1722427172115506, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.310081481933594, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.856918454170227, "num_tokens": 351546150.0, "step": 9215 }, { "epoch": 1.1723699274901411, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.44712829589844, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.852032482624054, "num_tokens": 351588574.0, "step": 9216 }, { "epoch": 1.1724971377687317, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.12913131713867, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8595903515815735, "num_tokens": 351626914.0, "step": 9217 }, { "epoch": 1.1726243480473222, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.49527359008789, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8502777814865112, "num_tokens": 351663574.0, "step": 9218 }, { "epoch": 1.1727515583259127, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.44681167602539, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8431146144866943, "num_tokens": 351703275.0, "step": 9219 }, { "epoch": 1.1728787686045032, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.02823257446289, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8500397801399231, "num_tokens": 351745713.0, "step": 9220 }, { "epoch": 1.1730059788830938, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.44939041137695, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8438930511474609, "num_tokens": 351780789.0, "step": 9221 }, { "epoch": 1.1731331891616843, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.42555236816406, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8691065907478333, "num_tokens": 351820352.0, "step": 9222 }, { "epoch": 1.1732603994402748, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.29993438720703, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8683478832244873, "num_tokens": 351860011.0, "step": 9223 }, { "epoch": 1.1733876097188654, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.32801055908203, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.852100133895874, "num_tokens": 351898742.0, "step": 9224 }, { "epoch": 1.1735148199974559, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.45276641845703, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8475697636604309, "num_tokens": 351940201.0, "step": 9225 }, { "epoch": 1.1736420302760462, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.22865295410156, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8558073043823242, "num_tokens": 351971041.0, "step": 9226 }, { "epoch": 1.1737692405546367, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.59189987182617, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8413209915161133, "num_tokens": 352013002.0, "step": 9227 }, { "epoch": 1.1738964508332272, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.353328704833984, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8576543927192688, "num_tokens": 352051435.0, "step": 9228 }, { "epoch": 1.1740236611118178, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.52464294433594, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.865465521812439, "num_tokens": 352093295.0, "step": 9229 }, { "epoch": 1.1741508713904083, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.58766174316406, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.854446291923523, "num_tokens": 352136401.0, "step": 9230 }, { "epoch": 1.1742780816689988, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.835723876953125, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.870703399181366, "num_tokens": 352178696.0, "step": 9231 }, { "epoch": 1.1744052919475894, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.05047607421875, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8413386940956116, "num_tokens": 352214759.0, "step": 9232 }, { "epoch": 1.1745325022261799, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.93980407714844, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.872795045375824, "num_tokens": 352250520.0, "step": 9233 }, { "epoch": 1.1746597125047704, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.29523468017578, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8562472462654114, "num_tokens": 352288773.0, "step": 9234 }, { "epoch": 1.174786922783361, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.34528350830078, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8666121959686279, "num_tokens": 352326548.0, "step": 9235 }, { "epoch": 1.1749141330619515, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.51935958862305, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.869200587272644, "num_tokens": 352366537.0, "step": 9236 }, { "epoch": 1.175041343340542, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.168888092041016, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8423531651496887, "num_tokens": 352402390.0, "step": 9237 }, { "epoch": 1.1751685536191325, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.613582611083984, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8557590246200562, "num_tokens": 352442357.0, "step": 9238 }, { "epoch": 1.175295763897723, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.22713851928711, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8565710783004761, "num_tokens": 352476884.0, "step": 9239 }, { "epoch": 1.1754229741763134, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.5615348815918, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8590694665908813, "num_tokens": 352513448.0, "step": 9240 }, { "epoch": 1.175550184454904, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.287925720214844, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.878616452217102, "num_tokens": 352548420.0, "step": 9241 }, { "epoch": 1.1756773947334944, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.546932220458984, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8616941571235657, "num_tokens": 352592157.0, "step": 9242 }, { "epoch": 1.175804605012085, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.324039459228516, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8515455722808838, "num_tokens": 352626427.0, "step": 9243 }, { "epoch": 1.1759318152906755, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.35568618774414, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8696616888046265, "num_tokens": 352667075.0, "step": 9244 }, { "epoch": 1.176059025569266, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.717227935791016, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8569486141204834, "num_tokens": 352707114.0, "step": 9245 }, { "epoch": 1.1761862358478565, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.15180969238281, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8645313382148743, "num_tokens": 352738064.0, "step": 9246 }, { "epoch": 1.176313446126447, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.7205924987793, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8434158563613892, "num_tokens": 352779964.0, "step": 9247 }, { "epoch": 1.1764406564050376, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.882965087890625, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8685303926467896, "num_tokens": 352819574.0, "step": 9248 }, { "epoch": 1.1765678666836281, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.63719940185547, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8691598176956177, "num_tokens": 352859151.0, "step": 9249 }, { "epoch": 1.1766950769622184, "ewc_loss": 0.1171875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.822845458984375e-05, "grad_norm": 37.26757049560547, "learning_rate": 1e-06, "loss": 0.6305, "mean_token_accuracy": 0.8367738723754883, "num_tokens": 352901119.0, "step": 9250 }, { "epoch": 1.176822287240809, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.502960205078125, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.857790470123291, "num_tokens": 352945878.0, "step": 9251 }, { "epoch": 1.1769494975193995, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 36.95500564575195, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8539668321609497, "num_tokens": 352979819.0, "step": 9252 }, { "epoch": 1.17707670779799, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.00772476196289, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8488335609436035, "num_tokens": 353023552.0, "step": 9253 }, { "epoch": 1.1772039180765805, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.38887023925781, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8613201379776001, "num_tokens": 353058178.0, "step": 9254 }, { "epoch": 1.177331128355171, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.029361724853516, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8684622049331665, "num_tokens": 353091428.0, "step": 9255 }, { "epoch": 1.1774583386337616, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.428443908691406, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8635193109512329, "num_tokens": 353131943.0, "step": 9256 }, { "epoch": 1.1775855489123521, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.2362174987793, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8505280017852783, "num_tokens": 353171213.0, "step": 9257 }, { "epoch": 1.1777127591909426, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.49850082397461, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.868216335773468, "num_tokens": 353205057.0, "step": 9258 }, { "epoch": 1.1778399694695332, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.09701919555664, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8455487489700317, "num_tokens": 353251164.0, "step": 9259 }, { "epoch": 1.1779671797481237, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.4309196472168, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.864717960357666, "num_tokens": 353288202.0, "step": 9260 }, { "epoch": 1.1780943900267142, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 36.852413177490234, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8578330278396606, "num_tokens": 353327396.0, "step": 9261 }, { "epoch": 1.1782216003053048, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.6009635925293, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8458734154701233, "num_tokens": 353366512.0, "step": 9262 }, { "epoch": 1.1783488105838953, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 36.933876037597656, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8512231111526489, "num_tokens": 353408546.0, "step": 9263 }, { "epoch": 1.1784760208624856, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.496009826660156, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8687212467193604, "num_tokens": 353444950.0, "step": 9264 }, { "epoch": 1.1786032311410761, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 36.9762077331543, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8658413887023926, "num_tokens": 353483306.0, "step": 9265 }, { "epoch": 1.1787304414196667, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.26180648803711, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8575482368469238, "num_tokens": 353523038.0, "step": 9266 }, { "epoch": 1.1788576516982572, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.46986389160156, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8697302937507629, "num_tokens": 353564266.0, "step": 9267 }, { "epoch": 1.1789848619768477, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 36.9192008972168, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8633885383605957, "num_tokens": 353603558.0, "step": 9268 }, { "epoch": 1.1791120722554382, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.57506561279297, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8662744760513306, "num_tokens": 353648097.0, "step": 9269 }, { "epoch": 1.1792392825340288, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.133628845214844, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8445929884910583, "num_tokens": 353688970.0, "step": 9270 }, { "epoch": 1.1793664928126193, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.42218780517578, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8483560681343079, "num_tokens": 353732505.0, "step": 9271 }, { "epoch": 1.1794937030912098, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.42650604248047, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.867356538772583, "num_tokens": 353767460.0, "step": 9272 }, { "epoch": 1.1796209133698004, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.18532180786133, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8681000471115112, "num_tokens": 353801951.0, "step": 9273 }, { "epoch": 1.1797481236483909, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.48146057128906, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8665145635604858, "num_tokens": 353836815.0, "step": 9274 }, { "epoch": 1.1798753339269812, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.09971237182617, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.852467954158783, "num_tokens": 353876296.0, "step": 9275 }, { "epoch": 1.1800025442055717, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.758460998535156, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8409011363983154, "num_tokens": 353911792.0, "step": 9276 }, { "epoch": 1.1801297544841622, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.483154296875, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8482728004455566, "num_tokens": 353949532.0, "step": 9277 }, { "epoch": 1.1802569647627528, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.502628326416016, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8458992838859558, "num_tokens": 353988622.0, "step": 9278 }, { "epoch": 1.1803841750413433, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.21357727050781, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8625207543373108, "num_tokens": 354028136.0, "step": 9279 }, { "epoch": 1.1805113853199338, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.61528778076172, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8605822324752808, "num_tokens": 354065029.0, "step": 9280 }, { "epoch": 1.1806385955985244, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.1710319519043, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8696848154067993, "num_tokens": 354099279.0, "step": 9281 }, { "epoch": 1.1807658058771149, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.425472259521484, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8537466526031494, "num_tokens": 354138870.0, "step": 9282 }, { "epoch": 1.1808930161557054, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.11005401611328, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8461828231811523, "num_tokens": 354176498.0, "step": 9283 }, { "epoch": 1.181020226434296, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.64886474609375, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8402724862098694, "num_tokens": 354210063.0, "step": 9284 }, { "epoch": 1.1811474367128865, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.3092155456543, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8522724509239197, "num_tokens": 354248095.0, "step": 9285 }, { "epoch": 1.181274646991477, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.33420944213867, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8565578460693359, "num_tokens": 354286897.0, "step": 9286 }, { "epoch": 1.1814018572700675, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.49948501586914, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8687230944633484, "num_tokens": 354330685.0, "step": 9287 }, { "epoch": 1.181529067548658, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.58675003051758, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8509877324104309, "num_tokens": 354371120.0, "step": 9288 }, { "epoch": 1.1816562778272484, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.201419830322266, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8563616871833801, "num_tokens": 354410825.0, "step": 9289 }, { "epoch": 1.1817834881058389, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.572731018066406, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8571884632110596, "num_tokens": 354453623.0, "step": 9290 }, { "epoch": 1.1819106983844294, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.65443801879883, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8578399419784546, "num_tokens": 354491854.0, "step": 9291 }, { "epoch": 1.18203790866302, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.37300491333008, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8456861972808838, "num_tokens": 354530788.0, "step": 9292 }, { "epoch": 1.1821651189416105, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.543495178222656, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8785125017166138, "num_tokens": 354568576.0, "step": 9293 }, { "epoch": 1.182292329220201, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.664520263671875, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8595223426818848, "num_tokens": 354611307.0, "step": 9294 }, { "epoch": 1.1824195394987915, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.473995208740234, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8661431074142456, "num_tokens": 354648810.0, "step": 9295 }, { "epoch": 1.182546749777382, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.596378326416016, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8610323667526245, "num_tokens": 354688411.0, "step": 9296 }, { "epoch": 1.1826739600559726, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.6175422668457, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8576774001121521, "num_tokens": 354734700.0, "step": 9297 }, { "epoch": 1.1828011703345631, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.05574417114258, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8561666011810303, "num_tokens": 354765517.0, "step": 9298 }, { "epoch": 1.1829283806131534, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.65000915527344, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8638409376144409, "num_tokens": 354808563.0, "step": 9299 }, { "epoch": 1.183055590891744, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.25320816040039, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8387436866760254, "num_tokens": 354846618.0, "step": 9300 }, { "epoch": 1.1831828011703345, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.76753234863281, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8614745140075684, "num_tokens": 354881293.0, "step": 9301 }, { "epoch": 1.183310011448925, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.39887619018555, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8558564186096191, "num_tokens": 354925239.0, "step": 9302 }, { "epoch": 1.1834372217275155, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.18042755126953, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.845174252986908, "num_tokens": 354967836.0, "step": 9303 }, { "epoch": 1.183564432006106, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.470455169677734, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8415846824645996, "num_tokens": 355003386.0, "step": 9304 }, { "epoch": 1.1836916422846966, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.422332763671875, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8373464345932007, "num_tokens": 355043391.0, "step": 9305 }, { "epoch": 1.1838188525632871, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.274391174316406, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8593795299530029, "num_tokens": 355082820.0, "step": 9306 }, { "epoch": 1.1839460628418776, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.11967086791992, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8790868520736694, "num_tokens": 355117531.0, "step": 9307 }, { "epoch": 1.1840732731204682, "ewc_loss": 0.11767578125, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.870529174804688e-05, "grad_norm": 37.325340270996094, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8469681739807129, "num_tokens": 355159485.0, "step": 9308 }, { "epoch": 1.1842004833990587, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.67217254638672, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8535621166229248, "num_tokens": 355206051.0, "step": 9309 }, { "epoch": 1.1843276936776492, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.40922927856445, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8519467711448669, "num_tokens": 355242933.0, "step": 9310 }, { "epoch": 1.1844549039562398, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.839412689208984, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8706508874893188, "num_tokens": 355286311.0, "step": 9311 }, { "epoch": 1.1845821142348303, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.126338958740234, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8583593368530273, "num_tokens": 355327844.0, "step": 9312 }, { "epoch": 1.1847093245134206, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.03167724609375, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8663342595100403, "num_tokens": 355368977.0, "step": 9313 }, { "epoch": 1.1848365347920111, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.109275817871094, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8472895622253418, "num_tokens": 355411479.0, "step": 9314 }, { "epoch": 1.1849637450706016, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.47397232055664, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8774069547653198, "num_tokens": 355447370.0, "step": 9315 }, { "epoch": 1.1850909553491922, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.80724334716797, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8552042245864868, "num_tokens": 355490335.0, "step": 9316 }, { "epoch": 1.1852181656277827, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.22532272338867, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.859892725944519, "num_tokens": 355527843.0, "step": 9317 }, { "epoch": 1.1853453759063732, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.837928771972656, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8566323518753052, "num_tokens": 355566094.0, "step": 9318 }, { "epoch": 1.1854725861849638, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.421897888183594, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8670946955680847, "num_tokens": 355609505.0, "step": 9319 }, { "epoch": 1.1855997964635543, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.622222900390625, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8338110446929932, "num_tokens": 355650930.0, "step": 9320 }, { "epoch": 1.1857270067421448, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.56526565551758, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8697881698608398, "num_tokens": 355684075.0, "step": 9321 }, { "epoch": 1.1858542170207353, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.80943298339844, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.862790584564209, "num_tokens": 355723567.0, "step": 9322 }, { "epoch": 1.1859814272993259, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.46371841430664, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8446757197380066, "num_tokens": 355764915.0, "step": 9323 }, { "epoch": 1.1861086375779162, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.01123809814453, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8601182103157043, "num_tokens": 355804950.0, "step": 9324 }, { "epoch": 1.1862358478565067, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.22900390625, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8655624389648438, "num_tokens": 355844155.0, "step": 9325 }, { "epoch": 1.1863630581350972, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.84286117553711, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8750213384628296, "num_tokens": 355883971.0, "step": 9326 }, { "epoch": 1.1864902684136878, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.465938568115234, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8541584014892578, "num_tokens": 355922271.0, "step": 9327 }, { "epoch": 1.1866174786922783, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.51332473754883, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8565776348114014, "num_tokens": 355961087.0, "step": 9328 }, { "epoch": 1.1867446889708688, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.49623489379883, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8634429574012756, "num_tokens": 355999553.0, "step": 9329 }, { "epoch": 1.1868718992494594, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.85643005371094, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.863539457321167, "num_tokens": 356038503.0, "step": 9330 }, { "epoch": 1.1869991095280499, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.714263916015625, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8705798983573914, "num_tokens": 356078538.0, "step": 9331 }, { "epoch": 1.1871263198066404, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.75603485107422, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8711521625518799, "num_tokens": 356115031.0, "step": 9332 }, { "epoch": 1.187253530085231, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.746124267578125, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8734544515609741, "num_tokens": 356154751.0, "step": 9333 }, { "epoch": 1.1873807403638215, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.49911117553711, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8441075086593628, "num_tokens": 356192105.0, "step": 9334 }, { "epoch": 1.187507950642412, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.74546432495117, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8472900986671448, "num_tokens": 356224964.0, "step": 9335 }, { "epoch": 1.1876351609210025, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.55210876464844, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8649430274963379, "num_tokens": 356261565.0, "step": 9336 }, { "epoch": 1.187762371199593, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.984710693359375, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8600890040397644, "num_tokens": 356301886.0, "step": 9337 }, { "epoch": 1.1878895814781834, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.8294677734375, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8665295839309692, "num_tokens": 356339481.0, "step": 9338 }, { "epoch": 1.1880167917567739, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.71180725097656, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8359782099723816, "num_tokens": 356387038.0, "step": 9339 }, { "epoch": 1.1881440020353644, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.682701110839844, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.854576051235199, "num_tokens": 356433256.0, "step": 9340 }, { "epoch": 1.188271212313955, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.487159729003906, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8473144769668579, "num_tokens": 356471282.0, "step": 9341 }, { "epoch": 1.1883984225925455, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.81902313232422, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8642252087593079, "num_tokens": 356508951.0, "step": 9342 }, { "epoch": 1.188525632871136, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.360435485839844, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8622606992721558, "num_tokens": 356544487.0, "step": 9343 }, { "epoch": 1.1886528431497265, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.7021369934082, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8650623559951782, "num_tokens": 356580873.0, "step": 9344 }, { "epoch": 1.188780053428317, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 36.92447280883789, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8562303185462952, "num_tokens": 356613622.0, "step": 9345 }, { "epoch": 1.1889072637069076, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.17630386352539, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8361893892288208, "num_tokens": 356660058.0, "step": 9346 }, { "epoch": 1.189034473985498, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 36.6820068359375, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8582408428192139, "num_tokens": 356701614.0, "step": 9347 }, { "epoch": 1.1891616842640884, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.11712646484375, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8741520047187805, "num_tokens": 356737105.0, "step": 9348 }, { "epoch": 1.189288894542679, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 36.941261291503906, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8491774201393127, "num_tokens": 356779382.0, "step": 9349 }, { "epoch": 1.1894161048212695, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.87297439575195, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.852977991104126, "num_tokens": 356813836.0, "step": 9350 }, { "epoch": 1.18954331509986, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.219661712646484, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8489829301834106, "num_tokens": 356852591.0, "step": 9351 }, { "epoch": 1.1896705253784505, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.74246597290039, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8569515347480774, "num_tokens": 356889940.0, "step": 9352 }, { "epoch": 1.189797735657041, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.094329833984375, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8619264960289001, "num_tokens": 356927878.0, "step": 9353 }, { "epoch": 1.1899249459356316, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.133544921875, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8673270344734192, "num_tokens": 356963339.0, "step": 9354 }, { "epoch": 1.1900521562142221, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.286956787109375, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8621295690536499, "num_tokens": 357002999.0, "step": 9355 }, { "epoch": 1.1901793664928126, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.02729034423828, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8611290454864502, "num_tokens": 357044305.0, "step": 9356 }, { "epoch": 1.1903065767714032, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.20306396484375, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.859720766544342, "num_tokens": 357082197.0, "step": 9357 }, { "epoch": 1.1904337870499937, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.85546112060547, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8399885296821594, "num_tokens": 357119541.0, "step": 9358 }, { "epoch": 1.1905609973285842, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.24654769897461, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8797063827514648, "num_tokens": 357159176.0, "step": 9359 }, { "epoch": 1.1906882076071748, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.611412048339844, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8583146333694458, "num_tokens": 357199204.0, "step": 9360 }, { "epoch": 1.1908154178857653, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.354148864746094, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8736181259155273, "num_tokens": 357236900.0, "step": 9361 }, { "epoch": 1.1909426281643556, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.57355499267578, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8606148362159729, "num_tokens": 357271622.0, "step": 9362 }, { "epoch": 1.1910698384429461, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.614585876464844, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8701566457748413, "num_tokens": 357311145.0, "step": 9363 }, { "epoch": 1.1911970487215366, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.50566101074219, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8747767806053162, "num_tokens": 357344609.0, "step": 9364 }, { "epoch": 1.1913242590001272, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.45229721069336, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8561779260635376, "num_tokens": 357378896.0, "step": 9365 }, { "epoch": 1.1914514692787177, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.639488220214844, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8634582757949829, "num_tokens": 357424730.0, "step": 9366 }, { "epoch": 1.1915786795573082, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.68675231933594, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8399714827537537, "num_tokens": 357461152.0, "step": 9367 }, { "epoch": 1.1917058898358988, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.57502746582031, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8513631820678711, "num_tokens": 357502305.0, "step": 9368 }, { "epoch": 1.1918331001144893, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.593387603759766, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8524234890937805, "num_tokens": 357539187.0, "step": 9369 }, { "epoch": 1.1919603103930798, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.75175094604492, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8485103845596313, "num_tokens": 357569520.0, "step": 9370 }, { "epoch": 1.1920875206716703, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.46897888183594, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8588415384292603, "num_tokens": 357605253.0, "step": 9371 }, { "epoch": 1.1922147309502609, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.682796478271484, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8690775632858276, "num_tokens": 357639701.0, "step": 9372 }, { "epoch": 1.1923419412288512, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.38011169433594, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8680087327957153, "num_tokens": 357675611.0, "step": 9373 }, { "epoch": 1.1924691515074417, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.55803680419922, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8705461025238037, "num_tokens": 357708521.0, "step": 9374 }, { "epoch": 1.1925963617860322, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.62636947631836, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8496545553207397, "num_tokens": 357749970.0, "step": 9375 }, { "epoch": 1.1927235720646228, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.76519012451172, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.869856059551239, "num_tokens": 357783287.0, "step": 9376 }, { "epoch": 1.1928507823432133, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.203590393066406, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8589799404144287, "num_tokens": 357818883.0, "step": 9377 }, { "epoch": 1.1929779926218038, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.92898178100586, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8735154867172241, "num_tokens": 357862862.0, "step": 9378 }, { "epoch": 1.1931052029003943, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.8672981262207, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8477494120597839, "num_tokens": 357901574.0, "step": 9379 }, { "epoch": 1.1932324131789849, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 36.905677795410156, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8581755757331848, "num_tokens": 357941822.0, "step": 9380 }, { "epoch": 1.1933596234575754, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.67448806762695, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8520541787147522, "num_tokens": 357982820.0, "step": 9381 }, { "epoch": 1.193486833736166, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.0915412902832, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8563703298568726, "num_tokens": 358020491.0, "step": 9382 }, { "epoch": 1.1936140440147565, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.75590896606445, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8609598278999329, "num_tokens": 358059775.0, "step": 9383 }, { "epoch": 1.193741254293347, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.30792236328125, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8669158220291138, "num_tokens": 358098672.0, "step": 9384 }, { "epoch": 1.1938684645719375, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.78034210205078, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8531618118286133, "num_tokens": 358140850.0, "step": 9385 }, { "epoch": 1.193995674850528, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.211055755615234, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8668354749679565, "num_tokens": 358175503.0, "step": 9386 }, { "epoch": 1.1941228851291183, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.13129806518555, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8607755899429321, "num_tokens": 358210615.0, "step": 9387 }, { "epoch": 1.1942500954077089, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.150848388671875, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8617523908615112, "num_tokens": 358246605.0, "step": 9388 }, { "epoch": 1.1943773056862994, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.748046875, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8563544750213623, "num_tokens": 358283136.0, "step": 9389 }, { "epoch": 1.19450451596489, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 36.79947280883789, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8678394556045532, "num_tokens": 358325437.0, "step": 9390 }, { "epoch": 1.1946317262434805, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.85931396484375, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8591905832290649, "num_tokens": 358365360.0, "step": 9391 }, { "epoch": 1.194758936522071, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.128334045410156, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8755643367767334, "num_tokens": 358406502.0, "step": 9392 }, { "epoch": 1.1948861468006615, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.69831466674805, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8550516366958618, "num_tokens": 358437220.0, "step": 9393 }, { "epoch": 1.195013357079252, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.58205032348633, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8470982313156128, "num_tokens": 358475314.0, "step": 9394 }, { "epoch": 1.1951405673578426, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.620609283447266, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8689087629318237, "num_tokens": 358520777.0, "step": 9395 }, { "epoch": 1.195267777636433, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.4130973815918, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8792346715927124, "num_tokens": 358557973.0, "step": 9396 }, { "epoch": 1.1953949879150234, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.594329833984375, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8671252727508545, "num_tokens": 358596795.0, "step": 9397 }, { "epoch": 1.195522198193614, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.81144332885742, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8600977063179016, "num_tokens": 358632146.0, "step": 9398 }, { "epoch": 1.1956494084722045, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.38582229614258, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.867271363735199, "num_tokens": 358672194.0, "step": 9399 }, { "epoch": 1.195776618750795, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.469566345214844, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8670868873596191, "num_tokens": 358710192.0, "step": 9400 }, { "epoch": 1.1959038290293855, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.64277267456055, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.845534086227417, "num_tokens": 358749284.0, "step": 9401 }, { "epoch": 1.196031039307976, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.39732360839844, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8542931079864502, "num_tokens": 358782830.0, "step": 9402 }, { "epoch": 1.1961582495865666, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.50489044189453, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8584825992584229, "num_tokens": 358820696.0, "step": 9403 }, { "epoch": 1.196285459865157, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.381690979003906, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8664259910583496, "num_tokens": 358853616.0, "step": 9404 }, { "epoch": 1.1964126701437476, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.558250427246094, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8680939674377441, "num_tokens": 358893420.0, "step": 9405 }, { "epoch": 1.1965398804223382, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.270721435546875, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8626334071159363, "num_tokens": 358930704.0, "step": 9406 }, { "epoch": 1.1966670907009287, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.58536148071289, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8735657930374146, "num_tokens": 358967730.0, "step": 9407 }, { "epoch": 1.1967943009795192, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.226097106933594, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.871508002281189, "num_tokens": 359007241.0, "step": 9408 }, { "epoch": 1.1969215112581097, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.65555953979492, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8459111452102661, "num_tokens": 359045573.0, "step": 9409 }, { "epoch": 1.1970487215367003, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.25178527832031, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8627961874008179, "num_tokens": 359083420.0, "step": 9410 }, { "epoch": 1.1971759318152906, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.57761001586914, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8715444803237915, "num_tokens": 359117555.0, "step": 9411 }, { "epoch": 1.1973031420938811, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.52775192260742, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8571304678916931, "num_tokens": 359154597.0, "step": 9412 }, { "epoch": 1.1974303523724716, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.59672927856445, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8536834120750427, "num_tokens": 359190074.0, "step": 9413 }, { "epoch": 1.1975575626510622, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.70693588256836, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8481519818305969, "num_tokens": 359232711.0, "step": 9414 }, { "epoch": 1.1976847729296527, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.62240982055664, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8594903945922852, "num_tokens": 359265582.0, "step": 9415 }, { "epoch": 1.1978119832082432, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.93710708618164, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8678620457649231, "num_tokens": 359303177.0, "step": 9416 }, { "epoch": 1.1979391934868338, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.26689910888672, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8691408038139343, "num_tokens": 359338712.0, "step": 9417 }, { "epoch": 1.1980664037654243, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.929866790771484, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8684400320053101, "num_tokens": 359372849.0, "step": 9418 }, { "epoch": 1.1981936140440148, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.24385070800781, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8649025559425354, "num_tokens": 359407981.0, "step": 9419 }, { "epoch": 1.1983208243226053, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.499542236328125, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.839256763458252, "num_tokens": 359450180.0, "step": 9420 }, { "epoch": 1.1984480346011959, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.5588493347168, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8589707612991333, "num_tokens": 359487909.0, "step": 9421 }, { "epoch": 1.1985752448797862, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.52417755126953, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8699700832366943, "num_tokens": 359522609.0, "step": 9422 }, { "epoch": 1.1987024551583767, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.39522933959961, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8577541708946228, "num_tokens": 359560735.0, "step": 9423 }, { "epoch": 1.1988296654369672, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.232234954833984, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8537405729293823, "num_tokens": 359602994.0, "step": 9424 }, { "epoch": 1.1989568757155578, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.295047760009766, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8608402609825134, "num_tokens": 359638864.0, "step": 9425 }, { "epoch": 1.1990840859941483, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.573970794677734, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8681918978691101, "num_tokens": 359674202.0, "step": 9426 }, { "epoch": 1.1992112962727388, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.15765380859375, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8657220005989075, "num_tokens": 359710408.0, "step": 9427 }, { "epoch": 1.1993385065513293, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.981204986572266, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8664698600769043, "num_tokens": 359749068.0, "step": 9428 }, { "epoch": 1.1994657168299199, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 36.85573959350586, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8615576028823853, "num_tokens": 359788449.0, "step": 9429 }, { "epoch": 1.1995929271085104, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.053653717041016, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8539223670959473, "num_tokens": 359824167.0, "step": 9430 }, { "epoch": 1.199720137387101, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.0496940612793, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8540287017822266, "num_tokens": 359861476.0, "step": 9431 }, { "epoch": 1.1998473476656915, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.94530487060547, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8577556610107422, "num_tokens": 359906463.0, "step": 9432 }, { "epoch": 1.199974557944282, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.105831146240234, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.864177942276001, "num_tokens": 359940576.0, "step": 9433 }, { "epoch": 1.2001017682228725, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.14316940307617, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8703008890151978, "num_tokens": 359973616.0, "step": 9434 }, { "epoch": 1.200228978501463, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.529170989990234, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8719425201416016, "num_tokens": 360011059.0, "step": 9435 }, { "epoch": 1.2003561887800533, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.273502349853516, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8685026168823242, "num_tokens": 360048307.0, "step": 9436 }, { "epoch": 1.2004833990586439, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 38.094627380371094, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8602355718612671, "num_tokens": 360086946.0, "step": 9437 }, { "epoch": 1.2006106093372344, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.19095993041992, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.865274965763092, "num_tokens": 360121908.0, "step": 9438 }, { "epoch": 1.200737819615825, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.716678619384766, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8506084084510803, "num_tokens": 360164349.0, "step": 9439 }, { "epoch": 1.2008650298944155, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.15333938598633, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8757631182670593, "num_tokens": 360197660.0, "step": 9440 }, { "epoch": 1.200992240173006, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.82875442504883, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8388131856918335, "num_tokens": 360240297.0, "step": 9441 }, { "epoch": 1.2011194504515965, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.20724105834961, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8504270315170288, "num_tokens": 360281566.0, "step": 9442 }, { "epoch": 1.201246660730187, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.72621154785156, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8652761578559875, "num_tokens": 360313927.0, "step": 9443 }, { "epoch": 1.2013738710087776, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.05769729614258, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.864084005355835, "num_tokens": 360349369.0, "step": 9444 }, { "epoch": 1.201501081287368, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.82086181640625, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8588488101959229, "num_tokens": 360386996.0, "step": 9445 }, { "epoch": 1.2016282915659584, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.172119140625, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8644460439682007, "num_tokens": 360424883.0, "step": 9446 }, { "epoch": 1.201755501844549, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.78794860839844, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8653265833854675, "num_tokens": 360457992.0, "step": 9447 }, { "epoch": 1.2018827121231395, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.308223724365234, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8449268937110901, "num_tokens": 360497934.0, "step": 9448 }, { "epoch": 1.20200992240173, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.69020080566406, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8573887348175049, "num_tokens": 360533928.0, "step": 9449 }, { "epoch": 1.2021371326803205, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.234703063964844, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8630573749542236, "num_tokens": 360571076.0, "step": 9450 }, { "epoch": 1.202264342958911, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.882137298583984, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8526335954666138, "num_tokens": 360611957.0, "step": 9451 }, { "epoch": 1.2023915532375016, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.40413284301758, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8532344698905945, "num_tokens": 360647823.0, "step": 9452 }, { "epoch": 1.202518763516092, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.768314361572266, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8521567583084106, "num_tokens": 360688341.0, "step": 9453 }, { "epoch": 1.2026459737946826, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.761627197265625, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.852522611618042, "num_tokens": 360730417.0, "step": 9454 }, { "epoch": 1.2027731840732732, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.29533386230469, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8394328355789185, "num_tokens": 360771931.0, "step": 9455 }, { "epoch": 1.2029003943518637, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.43770980834961, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.857878565788269, "num_tokens": 360811460.0, "step": 9456 }, { "epoch": 1.2030276046304542, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.913944244384766, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8721622228622437, "num_tokens": 360849476.0, "step": 9457 }, { "epoch": 1.2031548149090447, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.52180480957031, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8509576320648193, "num_tokens": 360891178.0, "step": 9458 }, { "epoch": 1.2032820251876353, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.81719207763672, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8552021980285645, "num_tokens": 360930503.0, "step": 9459 }, { "epoch": 1.2034092354662256, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.670623779296875, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8564609289169312, "num_tokens": 360963730.0, "step": 9460 }, { "epoch": 1.203536445744816, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.61813735961914, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8465993404388428, "num_tokens": 361003796.0, "step": 9461 }, { "epoch": 1.2036636560234066, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.744232177734375, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8600820899009705, "num_tokens": 361044818.0, "step": 9462 }, { "epoch": 1.2037908663019972, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.82917404174805, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8707230091094971, "num_tokens": 361080692.0, "step": 9463 }, { "epoch": 1.2039180765805877, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.52390670776367, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8499865531921387, "num_tokens": 361116609.0, "step": 9464 }, { "epoch": 1.2040452868591782, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.690574645996094, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8519651889801025, "num_tokens": 361154559.0, "step": 9465 }, { "epoch": 1.2041724971377687, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.52983856201172, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8672451972961426, "num_tokens": 361191005.0, "step": 9466 }, { "epoch": 1.2042997074163593, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.70370864868164, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8574255704879761, "num_tokens": 361230191.0, "step": 9467 }, { "epoch": 1.2044269176949498, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.73542404174805, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8535642623901367, "num_tokens": 361269132.0, "step": 9468 }, { "epoch": 1.2045541279735403, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.2626838684082, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8525393605232239, "num_tokens": 361306674.0, "step": 9469 }, { "epoch": 1.2046813382521309, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.68396759033203, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8607794642448425, "num_tokens": 361348309.0, "step": 9470 }, { "epoch": 1.2048085485307212, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.545467376708984, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8550987243652344, "num_tokens": 361386824.0, "step": 9471 }, { "epoch": 1.2049357588093117, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.45073699951172, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8397760987281799, "num_tokens": 361426590.0, "step": 9472 }, { "epoch": 1.2050629690879022, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.56642532348633, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8446773886680603, "num_tokens": 361463259.0, "step": 9473 }, { "epoch": 1.2051901793664928, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.657859802246094, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8687645792961121, "num_tokens": 361498750.0, "step": 9474 }, { "epoch": 1.2053173896450833, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.57989501953125, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8496841788291931, "num_tokens": 361537101.0, "step": 9475 }, { "epoch": 1.2054445999236738, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.7640380859375, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8616781234741211, "num_tokens": 361572130.0, "step": 9476 }, { "epoch": 1.2055718102022643, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.53883743286133, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8745659589767456, "num_tokens": 361608484.0, "step": 9477 }, { "epoch": 1.2056990204808549, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.880462646484375, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.878186821937561, "num_tokens": 361650361.0, "step": 9478 }, { "epoch": 1.2058262307594454, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.55178451538086, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8665482401847839, "num_tokens": 361680670.0, "step": 9479 }, { "epoch": 1.205953441038036, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.693904876708984, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8653364181518555, "num_tokens": 361723504.0, "step": 9480 }, { "epoch": 1.2060806513166265, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.38311767578125, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8669458627700806, "num_tokens": 361765850.0, "step": 9481 }, { "epoch": 1.206207861595217, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.73705291748047, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8502448201179504, "num_tokens": 361804375.0, "step": 9482 }, { "epoch": 1.2063350718738075, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.515926361083984, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8565375804901123, "num_tokens": 361834714.0, "step": 9483 }, { "epoch": 1.206462282152398, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.7688102722168, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8596063852310181, "num_tokens": 361877568.0, "step": 9484 }, { "epoch": 1.2065894924309883, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.36825180053711, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8557700514793396, "num_tokens": 361914785.0, "step": 9485 }, { "epoch": 1.2067167027095789, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.90168380737305, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8686223030090332, "num_tokens": 361956781.0, "step": 9486 }, { "epoch": 1.2068439129881694, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.27571105957031, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8692125082015991, "num_tokens": 361997408.0, "step": 9487 }, { "epoch": 1.20697112326676, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.984718322753906, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8581176996231079, "num_tokens": 362030226.0, "step": 9488 }, { "epoch": 1.2070983335453505, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 36.98953628540039, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8708735108375549, "num_tokens": 362068370.0, "step": 9489 }, { "epoch": 1.207225543823941, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.241092681884766, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8680897951126099, "num_tokens": 362108569.0, "step": 9490 }, { "epoch": 1.2073527541025315, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.00694274902344, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8508681654930115, "num_tokens": 362140476.0, "step": 9491 }, { "epoch": 1.207479964381122, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.139869689941406, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8671349287033081, "num_tokens": 362171132.0, "step": 9492 }, { "epoch": 1.2076071746597126, "ewc_loss": 0.11865234375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.14140701293945, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8587065935134888, "num_tokens": 362204791.0, "step": 9493 }, { "epoch": 1.207734384938303, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.77138137817383, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.86228346824646, "num_tokens": 362242360.0, "step": 9494 }, { "epoch": 1.2078615952168934, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.00763702392578, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8616039752960205, "num_tokens": 362276558.0, "step": 9495 }, { "epoch": 1.207988805495484, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.624046325683594, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8446297645568848, "num_tokens": 362307055.0, "step": 9496 }, { "epoch": 1.2081160157740745, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 36.96985626220703, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8702490329742432, "num_tokens": 362342433.0, "step": 9497 }, { "epoch": 1.208243226052665, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.53690719604492, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8813865184783936, "num_tokens": 362377008.0, "step": 9498 }, { "epoch": 1.2083704363312555, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.23151397705078, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8754321336746216, "num_tokens": 362412089.0, "step": 9499 }, { "epoch": 1.208497646609846, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.32276153564453, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8520482778549194, "num_tokens": 362455817.0, "step": 9500 }, { "epoch": 1.2086248568884366, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.085174560546875, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8617826700210571, "num_tokens": 362498859.0, "step": 9501 }, { "epoch": 1.208752067167027, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.440765380859375, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8602917194366455, "num_tokens": 362542346.0, "step": 9502 }, { "epoch": 1.2088792774456176, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.58161926269531, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8784452676773071, "num_tokens": 362575360.0, "step": 9503 }, { "epoch": 1.2090064877242082, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.45185089111328, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8606914281845093, "num_tokens": 362614737.0, "step": 9504 }, { "epoch": 1.2091336980027987, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.624122619628906, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8554121255874634, "num_tokens": 362653756.0, "step": 9505 }, { "epoch": 1.2092609082813892, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.278961181640625, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8611209988594055, "num_tokens": 362690745.0, "step": 9506 }, { "epoch": 1.2093881185599797, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.58832550048828, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8690619468688965, "num_tokens": 362726328.0, "step": 9507 }, { "epoch": 1.2095153288385703, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 36.955169677734375, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8626535534858704, "num_tokens": 362764285.0, "step": 9508 }, { "epoch": 1.2096425391171606, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.785926818847656, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8410691022872925, "num_tokens": 362794697.0, "step": 9509 }, { "epoch": 1.209769749395751, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.01420593261719, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8573921322822571, "num_tokens": 362831618.0, "step": 9510 }, { "epoch": 1.2098969596743416, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.63282775878906, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8678359985351562, "num_tokens": 362866404.0, "step": 9511 }, { "epoch": 1.2100241699529322, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.153961181640625, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8768891096115112, "num_tokens": 362902554.0, "step": 9512 }, { "epoch": 1.2101513802315227, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.2301025390625, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8549544215202332, "num_tokens": 362944169.0, "step": 9513 }, { "epoch": 1.2102785905101132, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.36323928833008, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8390481472015381, "num_tokens": 362983789.0, "step": 9514 }, { "epoch": 1.2104058007887037, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.32427215576172, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8666516542434692, "num_tokens": 363021551.0, "step": 9515 }, { "epoch": 1.2105330110672943, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.81718063354492, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8427128195762634, "num_tokens": 363064951.0, "step": 9516 }, { "epoch": 1.2106602213458848, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.33668518066406, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8593176603317261, "num_tokens": 363110876.0, "step": 9517 }, { "epoch": 1.2107874316244753, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.633262634277344, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8586156964302063, "num_tokens": 363155310.0, "step": 9518 }, { "epoch": 1.2109146419030659, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.144081115722656, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8704454302787781, "num_tokens": 363193166.0, "step": 9519 }, { "epoch": 1.2110418521816562, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.7359733581543, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8392652273178101, "num_tokens": 363225163.0, "step": 9520 }, { "epoch": 1.2111690624602467, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.13378143310547, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8583775758743286, "num_tokens": 363264720.0, "step": 9521 }, { "epoch": 1.2112962727388372, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.69523620605469, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8445608615875244, "num_tokens": 363305029.0, "step": 9522 }, { "epoch": 1.2114234830174277, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.05274200439453, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8682828545570374, "num_tokens": 363344128.0, "step": 9523 }, { "epoch": 1.2115506932960183, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.7309684753418, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8593876957893372, "num_tokens": 363385903.0, "step": 9524 }, { "epoch": 1.2116779035746088, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 36.92673873901367, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.86112380027771, "num_tokens": 363425216.0, "step": 9525 }, { "epoch": 1.2118051138531993, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.46624755859375, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8709515333175659, "num_tokens": 363462691.0, "step": 9526 }, { "epoch": 1.2119323241317899, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.28778839111328, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.844745397567749, "num_tokens": 363495849.0, "step": 9527 }, { "epoch": 1.2120595344103804, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.303585052490234, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8586723208427429, "num_tokens": 363535268.0, "step": 9528 }, { "epoch": 1.212186744688971, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.37192153930664, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8422667384147644, "num_tokens": 363573523.0, "step": 9529 }, { "epoch": 1.2123139549675614, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.179656982421875, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8596096038818359, "num_tokens": 363607545.0, "step": 9530 }, { "epoch": 1.212441165246152, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.58517837524414, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8613066077232361, "num_tokens": 363648456.0, "step": 9531 }, { "epoch": 1.2125683755247425, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.65011978149414, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8692211508750916, "num_tokens": 363688726.0, "step": 9532 }, { "epoch": 1.212695585803333, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.36170959472656, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8524119853973389, "num_tokens": 363728557.0, "step": 9533 }, { "epoch": 1.2128227960819233, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.47816848754883, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8600814938545227, "num_tokens": 363764191.0, "step": 9534 }, { "epoch": 1.2129500063605139, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.2650032043457, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8471215963363647, "num_tokens": 363798126.0, "step": 9535 }, { "epoch": 1.2130772166391044, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.732601165771484, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8445611596107483, "num_tokens": 363836037.0, "step": 9536 }, { "epoch": 1.213204426917695, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.29001998901367, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8807044625282288, "num_tokens": 363872359.0, "step": 9537 }, { "epoch": 1.2133316371962855, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.86689758300781, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8551003932952881, "num_tokens": 363914206.0, "step": 9538 }, { "epoch": 1.213458847474876, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.33587646484375, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8618720769882202, "num_tokens": 363956147.0, "step": 9539 }, { "epoch": 1.2135860577534665, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.469905853271484, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8772016167640686, "num_tokens": 363986328.0, "step": 9540 }, { "epoch": 1.213713268032057, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.25044250488281, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8580800294876099, "num_tokens": 364024905.0, "step": 9541 }, { "epoch": 1.2138404783106476, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.37737274169922, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8619166612625122, "num_tokens": 364068628.0, "step": 9542 }, { "epoch": 1.213967688589238, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.286678314208984, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.855470597743988, "num_tokens": 364104261.0, "step": 9543 }, { "epoch": 1.2140948988678284, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.310401916503906, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8565755486488342, "num_tokens": 364147524.0, "step": 9544 }, { "epoch": 1.214222109146419, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.40052032470703, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8610512018203735, "num_tokens": 364187715.0, "step": 9545 }, { "epoch": 1.2143493194250095, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.40976333618164, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8700414896011353, "num_tokens": 364226081.0, "step": 9546 }, { "epoch": 1.2144765297036, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.473976135253906, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8597232103347778, "num_tokens": 364268390.0, "step": 9547 }, { "epoch": 1.2146037399821905, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.17204284667969, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8618070483207703, "num_tokens": 364310176.0, "step": 9548 }, { "epoch": 1.214730950260781, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.79196548461914, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.870553731918335, "num_tokens": 364345652.0, "step": 9549 }, { "epoch": 1.2148581605393716, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 36.9549446105957, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8551114797592163, "num_tokens": 364383810.0, "step": 9550 }, { "epoch": 1.214985370817962, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.66041564941406, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8637940883636475, "num_tokens": 364421169.0, "step": 9551 }, { "epoch": 1.2151125810965526, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.20233917236328, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8545188903808594, "num_tokens": 364452552.0, "step": 9552 }, { "epoch": 1.2152397913751432, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.3453369140625, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8568559885025024, "num_tokens": 364486779.0, "step": 9553 }, { "epoch": 1.2153670016537337, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.87802505493164, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8709967732429504, "num_tokens": 364526237.0, "step": 9554 }, { "epoch": 1.2154942119323242, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 36.9655876159668, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8665995597839355, "num_tokens": 364568446.0, "step": 9555 }, { "epoch": 1.2156214222109147, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.76626205444336, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8669670224189758, "num_tokens": 364604448.0, "step": 9556 }, { "epoch": 1.2157486324895053, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 36.93923568725586, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.858100950717926, "num_tokens": 364642707.0, "step": 9557 }, { "epoch": 1.2158758427680956, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.09687805175781, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.854254961013794, "num_tokens": 364678428.0, "step": 9558 }, { "epoch": 1.216003053046686, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.25458526611328, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8742721080780029, "num_tokens": 364718915.0, "step": 9559 }, { "epoch": 1.2161302633252766, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.97058868408203, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.876178503036499, "num_tokens": 364756030.0, "step": 9560 }, { "epoch": 1.2162574736038672, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.62513732910156, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8576260805130005, "num_tokens": 364796158.0, "step": 9561 }, { "epoch": 1.2163846838824577, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.86098098754883, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8671126961708069, "num_tokens": 364832722.0, "step": 9562 }, { "epoch": 1.2165118941610482, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.55484390258789, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8600764274597168, "num_tokens": 364874120.0, "step": 9563 }, { "epoch": 1.2166391044396387, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.755611419677734, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.862303614616394, "num_tokens": 364913166.0, "step": 9564 }, { "epoch": 1.2167663147182293, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.49312973022461, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8424363136291504, "num_tokens": 364949586.0, "step": 9565 }, { "epoch": 1.2168935249968198, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.574012756347656, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.842063844203949, "num_tokens": 364989070.0, "step": 9566 }, { "epoch": 1.2170207352754103, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.780235290527344, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8647613525390625, "num_tokens": 365027050.0, "step": 9567 }, { "epoch": 1.2171479455540009, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.47941207885742, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8589467406272888, "num_tokens": 365060050.0, "step": 9568 }, { "epoch": 1.2172751558325912, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.069847106933594, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.8292433023452759, "num_tokens": 365098030.0, "step": 9569 }, { "epoch": 1.2174023661111817, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.355377197265625, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8622211813926697, "num_tokens": 365141143.0, "step": 9570 }, { "epoch": 1.2175295763897722, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.02937698364258, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.835639238357544, "num_tokens": 365174204.0, "step": 9571 }, { "epoch": 1.2176567866683627, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.63410568237305, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8607316017150879, "num_tokens": 365207170.0, "step": 9572 }, { "epoch": 1.2177839969469533, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.578426361083984, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8627530336380005, "num_tokens": 365248926.0, "step": 9573 }, { "epoch": 1.2179112072255438, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.717411041259766, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8625160455703735, "num_tokens": 365283588.0, "step": 9574 }, { "epoch": 1.2180384175041343, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.67702865600586, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8576557040214539, "num_tokens": 365324869.0, "step": 9575 }, { "epoch": 1.2181656277827249, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.9405403137207, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8563199639320374, "num_tokens": 365366832.0, "step": 9576 }, { "epoch": 1.2182928380613154, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.619163513183594, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8571381568908691, "num_tokens": 365401148.0, "step": 9577 }, { "epoch": 1.218420048339906, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 38.064430236816406, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8619827032089233, "num_tokens": 365442303.0, "step": 9578 }, { "epoch": 1.2185472586184964, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.47205352783203, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8716745376586914, "num_tokens": 365477877.0, "step": 9579 }, { "epoch": 1.218674468897087, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.938720703125, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8466837406158447, "num_tokens": 365520648.0, "step": 9580 }, { "epoch": 1.2188016791756775, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.842533111572266, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8420315980911255, "num_tokens": 365555129.0, "step": 9581 }, { "epoch": 1.218928889454268, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.898101806640625, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8654310703277588, "num_tokens": 365594368.0, "step": 9582 }, { "epoch": 1.2190560997328583, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 38.28413772583008, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.858416736125946, "num_tokens": 365634763.0, "step": 9583 }, { "epoch": 1.2191833100114489, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.70566177368164, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8756067752838135, "num_tokens": 365675951.0, "step": 9584 }, { "epoch": 1.2193105202900394, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.221656799316406, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8675293326377869, "num_tokens": 365710348.0, "step": 9585 }, { "epoch": 1.21943773056863, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.570281982421875, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8615593910217285, "num_tokens": 365745166.0, "step": 9586 }, { "epoch": 1.2195649408472204, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.450679779052734, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8722178936004639, "num_tokens": 365780122.0, "step": 9587 }, { "epoch": 1.219692151125811, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.492713928222656, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8574854135513306, "num_tokens": 365816326.0, "step": 9588 }, { "epoch": 1.2198193614044015, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 38.12946319580078, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8332899808883667, "num_tokens": 365858938.0, "step": 9589 }, { "epoch": 1.219946571682992, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.84367752075195, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8495712280273438, "num_tokens": 365893892.0, "step": 9590 }, { "epoch": 1.2200737819615826, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.881080627441406, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8493417501449585, "num_tokens": 365938787.0, "step": 9591 }, { "epoch": 1.220200992240173, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 38.360496520996094, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.85857093334198, "num_tokens": 365972664.0, "step": 9592 }, { "epoch": 1.2203282025187634, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.192840576171875, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8657436370849609, "num_tokens": 366013367.0, "step": 9593 }, { "epoch": 1.220455412797354, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 38.350765228271484, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8654438257217407, "num_tokens": 366051637.0, "step": 9594 }, { "epoch": 1.2205826230759445, "ewc_loss": 0.1181640625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.918212890625e-05, "grad_norm": 37.193416595458984, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8569285869598389, "num_tokens": 366092713.0, "step": 9595 }, { "epoch": 1.220709833354535, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.23862075805664, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8601939678192139, "num_tokens": 366131594.0, "step": 9596 }, { "epoch": 1.2208370436331255, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.51210403442383, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8550607562065125, "num_tokens": 366170871.0, "step": 9597 }, { "epoch": 1.220964253911716, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 38.34850311279297, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8488084077835083, "num_tokens": 366212974.0, "step": 9598 }, { "epoch": 1.2210914641903066, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 9.965896606445312e-05, "grad_norm": 37.66267776489258, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8503491878509521, "num_tokens": 366255904.0, "step": 9599 }, { "epoch": 1.221218674468897, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 38.600135803222656, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8653420209884644, "num_tokens": 366288887.0, "step": 9600 }, { "epoch": 1.2213458847474876, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.9969482421875, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8739428520202637, "num_tokens": 366328796.0, "step": 9601 }, { "epoch": 1.2214730950260781, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.731788635253906, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8560690879821777, "num_tokens": 366373093.0, "step": 9602 }, { "epoch": 1.2216003053046687, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 38.051048278808594, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.868527889251709, "num_tokens": 366414353.0, "step": 9603 }, { "epoch": 1.2217275155832592, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.70124816894531, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8739780187606812, "num_tokens": 366450683.0, "step": 9604 }, { "epoch": 1.2218547258618497, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 38.23414611816406, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8657009601593018, "num_tokens": 366487158.0, "step": 9605 }, { "epoch": 1.2219819361404403, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.389976501464844, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8609811067581177, "num_tokens": 366529984.0, "step": 9606 }, { "epoch": 1.2221091464190306, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 38.221866607666016, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8630478978157043, "num_tokens": 366564100.0, "step": 9607 }, { "epoch": 1.222236356697621, "ewc_loss": 0.119140625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.956993103027344, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8659136295318604, "num_tokens": 366603752.0, "step": 9608 }, { "epoch": 1.2223635669762116, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 38.01067352294922, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8619546890258789, "num_tokens": 366642170.0, "step": 9609 }, { "epoch": 1.2224907772548022, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.770565032958984, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8463047742843628, "num_tokens": 366684038.0, "step": 9610 }, { "epoch": 1.2226179875333927, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.872127532958984, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8577141761779785, "num_tokens": 366717832.0, "step": 9611 }, { "epoch": 1.2227451978119832, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.59503936767578, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8533114194869995, "num_tokens": 366757999.0, "step": 9612 }, { "epoch": 1.2228724080905737, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.87779998779297, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8684293627738953, "num_tokens": 366798533.0, "step": 9613 }, { "epoch": 1.2229996183691643, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.37996292114258, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8557934165000916, "num_tokens": 366843066.0, "step": 9614 }, { "epoch": 1.2231268286477548, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.834476470947266, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8754587173461914, "num_tokens": 366883190.0, "step": 9615 }, { "epoch": 1.2232540389263453, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.71086120605469, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8639817237854004, "num_tokens": 366920642.0, "step": 9616 }, { "epoch": 1.2233812492049359, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.4253044128418, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8611612319946289, "num_tokens": 366954468.0, "step": 9617 }, { "epoch": 1.2235084594835262, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 38.22969436645508, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8433562517166138, "num_tokens": 366991170.0, "step": 9618 }, { "epoch": 1.2236356697621167, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.348384857177734, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8644397258758545, "num_tokens": 367031833.0, "step": 9619 }, { "epoch": 1.2237628800407072, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.24665069580078, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8550060987472534, "num_tokens": 367074709.0, "step": 9620 }, { "epoch": 1.2238900903192977, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.35068893432617, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8576360940933228, "num_tokens": 367106820.0, "step": 9621 }, { "epoch": 1.2240173005978883, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.337196350097656, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8752360343933105, "num_tokens": 367142298.0, "step": 9622 }, { "epoch": 1.2241445108764788, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.374610900878906, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8495519161224365, "num_tokens": 367179957.0, "step": 9623 }, { "epoch": 1.2242717211550693, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.043094635009766, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8624851107597351, "num_tokens": 367226800.0, "step": 9624 }, { "epoch": 1.2243989314336599, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.388484954833984, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8654771447181702, "num_tokens": 367257168.0, "step": 9625 }, { "epoch": 1.2245261417122504, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.522911071777344, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8614088892936707, "num_tokens": 367295254.0, "step": 9626 }, { "epoch": 1.224653351990841, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.63078308105469, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8478432297706604, "num_tokens": 367335814.0, "step": 9627 }, { "epoch": 1.2247805622694314, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.7233772277832, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8563934564590454, "num_tokens": 367377592.0, "step": 9628 }, { "epoch": 1.224907772548022, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.85752487182617, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8518699407577515, "num_tokens": 367408069.0, "step": 9629 }, { "epoch": 1.2250349828266125, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.452396392822266, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8613067269325256, "num_tokens": 367444787.0, "step": 9630 }, { "epoch": 1.225162193105203, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.03133010864258, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8442635536193848, "num_tokens": 367486848.0, "step": 9631 }, { "epoch": 1.2252894033837933, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.282562255859375, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8697797060012817, "num_tokens": 367530456.0, "step": 9632 }, { "epoch": 1.2254166136623839, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.97509765625, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8646697402000427, "num_tokens": 367566513.0, "step": 9633 }, { "epoch": 1.2255438239409744, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.25027847290039, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8385785222053528, "num_tokens": 367607757.0, "step": 9634 }, { "epoch": 1.225671034219565, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.905059814453125, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8623250126838684, "num_tokens": 367645525.0, "step": 9635 }, { "epoch": 1.2257982444981554, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.732627868652344, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8683468103408813, "num_tokens": 367680840.0, "step": 9636 }, { "epoch": 1.225925454776746, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.22848129272461, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8686138391494751, "num_tokens": 367717444.0, "step": 9637 }, { "epoch": 1.2260526650553365, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.975669860839844, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8399905562400818, "num_tokens": 367756548.0, "step": 9638 }, { "epoch": 1.226179875333927, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.38560485839844, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8539962768554688, "num_tokens": 367798341.0, "step": 9639 }, { "epoch": 1.2263070856125176, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.3036003112793, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8674864768981934, "num_tokens": 367835087.0, "step": 9640 }, { "epoch": 1.226434295891108, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.14138412475586, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8619529008865356, "num_tokens": 367873816.0, "step": 9641 }, { "epoch": 1.2265615061696984, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.034751892089844, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8555792570114136, "num_tokens": 367913621.0, "step": 9642 }, { "epoch": 1.226688716448289, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.52252960205078, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8500909209251404, "num_tokens": 367946881.0, "step": 9643 }, { "epoch": 1.2268159267268794, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.891456604003906, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8641353845596313, "num_tokens": 367980806.0, "step": 9644 }, { "epoch": 1.22694313700547, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.83522033691406, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8672091364860535, "num_tokens": 368016041.0, "step": 9645 }, { "epoch": 1.2270703472840605, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.39841842651367, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8576053380966187, "num_tokens": 368052331.0, "step": 9646 }, { "epoch": 1.227197557562651, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.96342468261719, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.867931604385376, "num_tokens": 368091458.0, "step": 9647 }, { "epoch": 1.2273247678412416, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.725074768066406, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.847312867641449, "num_tokens": 368132433.0, "step": 9648 }, { "epoch": 1.227451978119832, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.60084915161133, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.86070716381073, "num_tokens": 368171030.0, "step": 9649 }, { "epoch": 1.2275791883984226, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.5581169128418, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8481327295303345, "num_tokens": 368211555.0, "step": 9650 }, { "epoch": 1.2277063986770131, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.85704040527344, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8668252825737, "num_tokens": 368252076.0, "step": 9651 }, { "epoch": 1.2278336089556037, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.50738525390625, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8531235456466675, "num_tokens": 368291265.0, "step": 9652 }, { "epoch": 1.2279608192341942, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.88101577758789, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8539704084396362, "num_tokens": 368332178.0, "step": 9653 }, { "epoch": 1.2280880295127847, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.66755676269531, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8588652610778809, "num_tokens": 368374103.0, "step": 9654 }, { "epoch": 1.2282152397913753, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.72785568237305, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8619716167449951, "num_tokens": 368411928.0, "step": 9655 }, { "epoch": 1.2283424500699656, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.387691497802734, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8341827392578125, "num_tokens": 368449200.0, "step": 9656 }, { "epoch": 1.228469660348556, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.831302642822266, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8409888744354248, "num_tokens": 368489935.0, "step": 9657 }, { "epoch": 1.2285968706271466, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.52573013305664, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8704671263694763, "num_tokens": 368522227.0, "step": 9658 }, { "epoch": 1.2287240809057371, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.627166748046875, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8587617874145508, "num_tokens": 368555084.0, "step": 9659 }, { "epoch": 1.2288512911843277, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.91804504394531, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8552328944206238, "num_tokens": 368591845.0, "step": 9660 }, { "epoch": 1.2289785014629182, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.90790557861328, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8478745222091675, "num_tokens": 368637841.0, "step": 9661 }, { "epoch": 1.2291057117415087, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.30654525756836, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8758260607719421, "num_tokens": 368677470.0, "step": 9662 }, { "epoch": 1.2292329220200993, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.30221176147461, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8725992441177368, "num_tokens": 368718412.0, "step": 9663 }, { "epoch": 1.2293601322986898, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.50634002685547, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.848333477973938, "num_tokens": 368759267.0, "step": 9664 }, { "epoch": 1.2294873425772803, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.85980987548828, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8484622240066528, "num_tokens": 368797586.0, "step": 9665 }, { "epoch": 1.2296145528558708, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.59467697143555, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8505672812461853, "num_tokens": 368833801.0, "step": 9666 }, { "epoch": 1.2297417631344612, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.457489013671875, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8434134721755981, "num_tokens": 368878513.0, "step": 9667 }, { "epoch": 1.2298689734130517, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.9100456237793, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8521500825881958, "num_tokens": 368917815.0, "step": 9668 }, { "epoch": 1.2299961836916422, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.243473052978516, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8477684259414673, "num_tokens": 368954971.0, "step": 9669 }, { "epoch": 1.2301233939702327, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.305728912353516, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8643829822540283, "num_tokens": 368996212.0, "step": 9670 }, { "epoch": 1.2302506042488233, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 36.94186782836914, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8721632361412048, "num_tokens": 369032186.0, "step": 9671 }, { "epoch": 1.2303778145274138, "ewc_loss": 0.125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.630645751953125, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8640730381011963, "num_tokens": 369077565.0, "step": 9672 }, { "epoch": 1.2305050248060043, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.49334716796875, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8663495779037476, "num_tokens": 369111935.0, "step": 9673 }, { "epoch": 1.2306322350845948, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.151371002197266, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.857905387878418, "num_tokens": 369154374.0, "step": 9674 }, { "epoch": 1.2307594453631854, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.92045211791992, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8502456545829773, "num_tokens": 369198557.0, "step": 9675 }, { "epoch": 1.230886655641776, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.871665954589844, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8432261943817139, "num_tokens": 369237756.0, "step": 9676 }, { "epoch": 1.2310138659203664, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.70854949951172, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8463061451911926, "num_tokens": 369270155.0, "step": 9677 }, { "epoch": 1.231141076198957, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.90697479248047, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8527535200119019, "num_tokens": 369310870.0, "step": 9678 }, { "epoch": 1.2312682864775475, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.88641357421875, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8421871662139893, "num_tokens": 369344679.0, "step": 9679 }, { "epoch": 1.231395496756138, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.03383255004883, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8578919768333435, "num_tokens": 369380388.0, "step": 9680 }, { "epoch": 1.2315227070347283, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.86606979370117, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8706567287445068, "num_tokens": 369422327.0, "step": 9681 }, { "epoch": 1.2316499173133189, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.704246520996094, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8654786944389343, "num_tokens": 369457160.0, "step": 9682 }, { "epoch": 1.2317771275919094, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.601783752441406, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8589421510696411, "num_tokens": 369497883.0, "step": 9683 }, { "epoch": 1.2319043378705, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.92387771606445, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8887268304824829, "num_tokens": 369540014.0, "step": 9684 }, { "epoch": 1.2320315481490904, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.66272735595703, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8589197993278503, "num_tokens": 369580424.0, "step": 9685 }, { "epoch": 1.232158758427681, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.97128677368164, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8545790314674377, "num_tokens": 369623615.0, "step": 9686 }, { "epoch": 1.2322859687062715, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.79469680786133, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8465284705162048, "num_tokens": 369658408.0, "step": 9687 }, { "epoch": 1.232413178984862, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.7612190246582, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8555771112442017, "num_tokens": 369696311.0, "step": 9688 }, { "epoch": 1.2325403892634526, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.6355094909668, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8449956178665161, "num_tokens": 369737473.0, "step": 9689 }, { "epoch": 1.232667599542043, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.191741943359375, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8564127087593079, "num_tokens": 369770151.0, "step": 9690 }, { "epoch": 1.2327948098206334, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.689125061035156, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8759166598320007, "num_tokens": 369813503.0, "step": 9691 }, { "epoch": 1.232922020099224, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.152366638183594, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.860975980758667, "num_tokens": 369848768.0, "step": 9692 }, { "epoch": 1.2330492303778144, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.800575256347656, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.862175464630127, "num_tokens": 369892510.0, "step": 9693 }, { "epoch": 1.233176440656405, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.39387130737305, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8670835494995117, "num_tokens": 369929638.0, "step": 9694 }, { "epoch": 1.2333036509349955, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.476356506347656, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8792228102684021, "num_tokens": 369971201.0, "step": 9695 }, { "epoch": 1.233430861213586, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.98086929321289, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.852463960647583, "num_tokens": 370011713.0, "step": 9696 }, { "epoch": 1.2335580714921766, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.1250114440918, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8386403918266296, "num_tokens": 370042647.0, "step": 9697 }, { "epoch": 1.233685281770767, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.59136199951172, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8563327789306641, "num_tokens": 370083976.0, "step": 9698 }, { "epoch": 1.2338124920493576, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.53743362426758, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8553435802459717, "num_tokens": 370126435.0, "step": 9699 }, { "epoch": 1.2339397023279481, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.47257614135742, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8675615787506104, "num_tokens": 370165038.0, "step": 9700 }, { "epoch": 1.2340669126065387, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.36333465576172, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8505936861038208, "num_tokens": 370203438.0, "step": 9701 }, { "epoch": 1.2341941228851292, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.61610794067383, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.875100314617157, "num_tokens": 370235144.0, "step": 9702 }, { "epoch": 1.2343213331637197, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.168025970458984, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8538261651992798, "num_tokens": 370278936.0, "step": 9703 }, { "epoch": 1.2344485434423103, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.74946212768555, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8620439767837524, "num_tokens": 370312595.0, "step": 9704 }, { "epoch": 1.2345757537209006, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.29032516479492, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8558698296546936, "num_tokens": 370351697.0, "step": 9705 }, { "epoch": 1.234702963999491, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.64099884033203, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8722874522209167, "num_tokens": 370391264.0, "step": 9706 }, { "epoch": 1.2348301742780816, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.994319915771484, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8669635057449341, "num_tokens": 370425248.0, "step": 9707 }, { "epoch": 1.2349573845566721, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.45035934448242, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8384634256362915, "num_tokens": 370462670.0, "step": 9708 }, { "epoch": 1.2350845948352627, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.001888275146484, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8388845920562744, "num_tokens": 370506126.0, "step": 9709 }, { "epoch": 1.2352118051138532, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.62620162963867, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.869493842124939, "num_tokens": 370546895.0, "step": 9710 }, { "epoch": 1.2353390153924437, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.041969299316406, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8637049794197083, "num_tokens": 370590645.0, "step": 9711 }, { "epoch": 1.2354662256710343, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.71711730957031, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8613314628601074, "num_tokens": 370628614.0, "step": 9712 }, { "epoch": 1.2355934359496248, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.6537971496582, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8612714409828186, "num_tokens": 370666082.0, "step": 9713 }, { "epoch": 1.2357206462282153, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.694366455078125, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8443332314491272, "num_tokens": 370705780.0, "step": 9714 }, { "epoch": 1.2358478565068058, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.71706008911133, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8651822805404663, "num_tokens": 370742550.0, "step": 9715 }, { "epoch": 1.2359750667853961, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.891357421875, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8639910221099854, "num_tokens": 370784406.0, "step": 9716 }, { "epoch": 1.2361022770639867, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.6467170715332, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8545719981193542, "num_tokens": 370820911.0, "step": 9717 }, { "epoch": 1.2362294873425772, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.68927001953125, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8685725927352905, "num_tokens": 370857427.0, "step": 9718 }, { "epoch": 1.2363566976211677, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.650089263916016, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8495864868164062, "num_tokens": 370897989.0, "step": 9719 }, { "epoch": 1.2364839078997583, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.808040618896484, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8473454713821411, "num_tokens": 370932672.0, "step": 9720 }, { "epoch": 1.2366111181783488, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.73767852783203, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8494511246681213, "num_tokens": 370971027.0, "step": 9721 }, { "epoch": 1.2367383284569393, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.65413284301758, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8697677850723267, "num_tokens": 371014742.0, "step": 9722 }, { "epoch": 1.2368655387355298, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.07350540161133, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8443864583969116, "num_tokens": 371052065.0, "step": 9723 }, { "epoch": 1.2369927490141204, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.029266357421875, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8365188837051392, "num_tokens": 371093897.0, "step": 9724 }, { "epoch": 1.237119959292711, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.5282096862793, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8572852611541748, "num_tokens": 371132390.0, "step": 9725 }, { "epoch": 1.2372471695713014, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.86687088012695, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8700103759765625, "num_tokens": 371170468.0, "step": 9726 }, { "epoch": 1.237374379849892, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.47808837890625, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8560553789138794, "num_tokens": 371209323.0, "step": 9727 }, { "epoch": 1.2375015901284825, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.196922302246094, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8605104684829712, "num_tokens": 371246526.0, "step": 9728 }, { "epoch": 1.237628800407073, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.27458953857422, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.861504077911377, "num_tokens": 371282850.0, "step": 9729 }, { "epoch": 1.2377560106856633, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.933101654052734, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8668141961097717, "num_tokens": 371312614.0, "step": 9730 }, { "epoch": 1.2378832209642538, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.79378128051758, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8607975244522095, "num_tokens": 371350170.0, "step": 9731 }, { "epoch": 1.2380104312428444, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.49859619140625, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8658926486968994, "num_tokens": 371389295.0, "step": 9732 }, { "epoch": 1.238137641521435, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.276702880859375, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8597387671470642, "num_tokens": 371430579.0, "step": 9733 }, { "epoch": 1.2382648518000254, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.77460861206055, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8499633073806763, "num_tokens": 371465087.0, "step": 9734 }, { "epoch": 1.238392062078616, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.235530853271484, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8670326471328735, "num_tokens": 371503549.0, "step": 9735 }, { "epoch": 1.2385192723572065, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.45328140258789, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8782549500465393, "num_tokens": 371542953.0, "step": 9736 }, { "epoch": 1.238646482635797, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.31632995605469, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8620076775550842, "num_tokens": 371582541.0, "step": 9737 }, { "epoch": 1.2387736929143875, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.50310516357422, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8698561191558838, "num_tokens": 371617221.0, "step": 9738 }, { "epoch": 1.238900903192978, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.08668518066406, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.863095223903656, "num_tokens": 371660773.0, "step": 9739 }, { "epoch": 1.2390281134715684, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.6461181640625, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8539867401123047, "num_tokens": 371694763.0, "step": 9740 }, { "epoch": 1.239155323750159, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.05182647705078, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8587868809700012, "num_tokens": 371734793.0, "step": 9741 }, { "epoch": 1.2392825340287494, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.779998779296875, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8538825511932373, "num_tokens": 371775691.0, "step": 9742 }, { "epoch": 1.23940974430734, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.717594146728516, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8506201505661011, "num_tokens": 371817319.0, "step": 9743 }, { "epoch": 1.2395369545859305, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.5038948059082, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.859142541885376, "num_tokens": 371847431.0, "step": 9744 }, { "epoch": 1.239664164864521, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.088539123535156, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8719788193702698, "num_tokens": 371881788.0, "step": 9745 }, { "epoch": 1.2397913751431116, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.26545333862305, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8529211282730103, "num_tokens": 371917388.0, "step": 9746 }, { "epoch": 1.239918585421702, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.06948471069336, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8680846095085144, "num_tokens": 371957561.0, "step": 9747 }, { "epoch": 1.2400457957002926, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.55002212524414, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8516186475753784, "num_tokens": 371987322.0, "step": 9748 }, { "epoch": 1.2401730059788831, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.8394889831543, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8618875741958618, "num_tokens": 372027263.0, "step": 9749 }, { "epoch": 1.2403002162574737, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.73316192626953, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8632810115814209, "num_tokens": 372061441.0, "step": 9750 }, { "epoch": 1.2404274265360642, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.5078239440918, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8667245507240295, "num_tokens": 372099085.0, "step": 9751 }, { "epoch": 1.2405546368146547, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.356685638427734, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8610891699790955, "num_tokens": 372134106.0, "step": 9752 }, { "epoch": 1.2406818470932452, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.573150634765625, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8669542074203491, "num_tokens": 372164635.0, "step": 9753 }, { "epoch": 1.2408090573718356, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.23338317871094, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8651400208473206, "num_tokens": 372204924.0, "step": 9754 }, { "epoch": 1.240936267650426, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.74576187133789, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8601418733596802, "num_tokens": 372240939.0, "step": 9755 }, { "epoch": 1.2410634779290166, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.84267044067383, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8543190360069275, "num_tokens": 372272355.0, "step": 9756 }, { "epoch": 1.2411906882076071, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.80723571777344, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8583632707595825, "num_tokens": 372309566.0, "step": 9757 }, { "epoch": 1.2413178984861977, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.608848571777344, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8533320426940918, "num_tokens": 372347768.0, "step": 9758 }, { "epoch": 1.2414451087647882, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.853973388671875, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8679407238960266, "num_tokens": 372381020.0, "step": 9759 }, { "epoch": 1.2415723190433787, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.61710739135742, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8478060960769653, "num_tokens": 372422434.0, "step": 9760 }, { "epoch": 1.2416995293219693, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.48653793334961, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8499967455863953, "num_tokens": 372459342.0, "step": 9761 }, { "epoch": 1.2418267396005598, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.03077697753906, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.863429069519043, "num_tokens": 372496637.0, "step": 9762 }, { "epoch": 1.2419539498791503, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.71510696411133, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8721641302108765, "num_tokens": 372532068.0, "step": 9763 }, { "epoch": 1.2420811601577408, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.73623275756836, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8684473037719727, "num_tokens": 372565460.0, "step": 9764 }, { "epoch": 1.2422083704363311, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.59347915649414, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8639752268791199, "num_tokens": 372605998.0, "step": 9765 }, { "epoch": 1.2423355807149217, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.82427978515625, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8498002886772156, "num_tokens": 372642038.0, "step": 9766 }, { "epoch": 1.2424627909935122, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.617794036865234, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8542149066925049, "num_tokens": 372678192.0, "step": 9767 }, { "epoch": 1.2425900012721027, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.037017822265625, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8540711402893066, "num_tokens": 372712867.0, "step": 9768 }, { "epoch": 1.2427172115506933, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.54957962036133, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8411228656768799, "num_tokens": 372745790.0, "step": 9769 }, { "epoch": 1.2428444218292838, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.41273880004883, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8686714172363281, "num_tokens": 372777634.0, "step": 9770 }, { "epoch": 1.2429716321078743, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.53477096557617, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8445107340812683, "num_tokens": 372820527.0, "step": 9771 }, { "epoch": 1.2430988423864648, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.1621208190918, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8704126477241516, "num_tokens": 372858479.0, "step": 9772 }, { "epoch": 1.2432260526650554, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.633949279785156, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8662182688713074, "num_tokens": 372895219.0, "step": 9773 }, { "epoch": 1.243353262943646, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.956600189208984, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8718699216842651, "num_tokens": 372934491.0, "step": 9774 }, { "epoch": 1.2434804732222364, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.793827056884766, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8508496284484863, "num_tokens": 372972892.0, "step": 9775 }, { "epoch": 1.243607683500827, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.26660919189453, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8629104495048523, "num_tokens": 373011564.0, "step": 9776 }, { "epoch": 1.2437348937794175, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.67314529418945, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8618474006652832, "num_tokens": 373049444.0, "step": 9777 }, { "epoch": 1.243862104058008, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.08732223510742, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8596383929252625, "num_tokens": 373085466.0, "step": 9778 }, { "epoch": 1.2439893143365983, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.54115295410156, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8373332023620605, "num_tokens": 373124712.0, "step": 9779 }, { "epoch": 1.2441165246151888, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.006649017333984, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.863994836807251, "num_tokens": 373160008.0, "step": 9780 }, { "epoch": 1.2442437348937794, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.732906341552734, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.859002947807312, "num_tokens": 373192110.0, "step": 9781 }, { "epoch": 1.24437094517237, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.96873474121094, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8612166047096252, "num_tokens": 373234432.0, "step": 9782 }, { "epoch": 1.2444981554509604, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.956214904785156, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8506808280944824, "num_tokens": 373270725.0, "step": 9783 }, { "epoch": 1.244625365729551, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.6370964050293, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8372533917427063, "num_tokens": 373307531.0, "step": 9784 }, { "epoch": 1.2447525760081415, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.474212646484375, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8630362749099731, "num_tokens": 373351657.0, "step": 9785 }, { "epoch": 1.244879786286732, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.04823684692383, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8634141087532043, "num_tokens": 373388811.0, "step": 9786 }, { "epoch": 1.2450069965653225, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.80891418457031, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8689963817596436, "num_tokens": 373423378.0, "step": 9787 }, { "epoch": 1.245134206843913, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.555091857910156, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8611135482788086, "num_tokens": 373460982.0, "step": 9788 }, { "epoch": 1.2452614171225034, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.716575622558594, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8662769198417664, "num_tokens": 373501474.0, "step": 9789 }, { "epoch": 1.245388627401094, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.58655548095703, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8637549877166748, "num_tokens": 373537830.0, "step": 9790 }, { "epoch": 1.2455158376796844, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.736263275146484, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8523412942886353, "num_tokens": 373573975.0, "step": 9791 }, { "epoch": 1.245643047958275, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.64268493652344, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8373221158981323, "num_tokens": 373612157.0, "step": 9792 }, { "epoch": 1.2457702582368655, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.82147216796875, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8779087066650391, "num_tokens": 373645065.0, "step": 9793 }, { "epoch": 1.245897468515456, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.809356689453125, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8541135191917419, "num_tokens": 373687658.0, "step": 9794 }, { "epoch": 1.2460246787940465, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.75419235229492, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8603258728981018, "num_tokens": 373732262.0, "step": 9795 }, { "epoch": 1.246151889072637, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 40.112998962402344, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8407251834869385, "num_tokens": 373772410.0, "step": 9796 }, { "epoch": 1.2462790993512276, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.443023681640625, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8591195344924927, "num_tokens": 373808773.0, "step": 9797 }, { "epoch": 1.2464063096298181, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.550724029541016, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8549752235412598, "num_tokens": 373848945.0, "step": 9798 }, { "epoch": 1.2465335199084087, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.18576431274414, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8536080121994019, "num_tokens": 373892716.0, "step": 9799 }, { "epoch": 1.2466607301869992, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.81452941894531, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8750922679901123, "num_tokens": 373929443.0, "step": 9800 }, { "epoch": 1.2467879404655897, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.40754699707031, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8603556752204895, "num_tokens": 373968882.0, "step": 9801 }, { "epoch": 1.2469151507441802, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.26168441772461, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8727608919143677, "num_tokens": 374009265.0, "step": 9802 }, { "epoch": 1.2470423610227706, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.75191116333008, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8435693383216858, "num_tokens": 374047152.0, "step": 9803 }, { "epoch": 1.247169571301361, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.83073806762695, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.855191707611084, "num_tokens": 374083127.0, "step": 9804 }, { "epoch": 1.2472967815799516, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.988182067871094, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8689005374908447, "num_tokens": 374127086.0, "step": 9805 }, { "epoch": 1.2474239918585421, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.9658317565918, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8539372682571411, "num_tokens": 374163879.0, "step": 9806 }, { "epoch": 1.2475512021371327, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.703765869140625, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8652511835098267, "num_tokens": 374206639.0, "step": 9807 }, { "epoch": 1.2476784124157232, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.96657180786133, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8566762208938599, "num_tokens": 374244106.0, "step": 9808 }, { "epoch": 1.2478056226943137, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.93225860595703, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.856093168258667, "num_tokens": 374285776.0, "step": 9809 }, { "epoch": 1.2479328329729042, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.56199645996094, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8685451149940491, "num_tokens": 374323036.0, "step": 9810 }, { "epoch": 1.2480600432514948, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.06521987915039, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8657386898994446, "num_tokens": 374360204.0, "step": 9811 }, { "epoch": 1.2481872535300853, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.65777587890625, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8709186315536499, "num_tokens": 374394961.0, "step": 9812 }, { "epoch": 1.2483144638086758, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.750244140625, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8620269298553467, "num_tokens": 374436142.0, "step": 9813 }, { "epoch": 1.2484416740872661, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.84539794921875, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8547727465629578, "num_tokens": 374474263.0, "step": 9814 }, { "epoch": 1.2485688843658567, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.74734878540039, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.853812038898468, "num_tokens": 374510250.0, "step": 9815 }, { "epoch": 1.2486960946444472, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.2636604309082, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8782360553741455, "num_tokens": 374549579.0, "step": 9816 }, { "epoch": 1.2488233049230377, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.5499267578125, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8582298755645752, "num_tokens": 374588631.0, "step": 9817 }, { "epoch": 1.2489505152016283, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.202842712402344, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8559740781784058, "num_tokens": 374627092.0, "step": 9818 }, { "epoch": 1.2490777254802188, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.0899543762207, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8657006025314331, "num_tokens": 374657797.0, "step": 9819 }, { "epoch": 1.2492049357588093, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.009925842285156, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8642207384109497, "num_tokens": 374694011.0, "step": 9820 }, { "epoch": 1.2493321460373998, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.83179473876953, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8625005483627319, "num_tokens": 374735127.0, "step": 9821 }, { "epoch": 1.2494593563159904, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.69731140136719, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8566022515296936, "num_tokens": 374774170.0, "step": 9822 }, { "epoch": 1.249586566594581, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.17535400390625, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8602449893951416, "num_tokens": 374809162.0, "step": 9823 }, { "epoch": 1.2497137768731714, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.52759552001953, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8523014783859253, "num_tokens": 374847974.0, "step": 9824 }, { "epoch": 1.249840987151762, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.19398880004883, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8744046688079834, "num_tokens": 374894911.0, "step": 9825 }, { "epoch": 1.2499681974303525, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.5233039855957, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.853773832321167, "num_tokens": 374933534.0, "step": 9826 }, { "epoch": 1.250095407708943, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.194496154785156, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8471519947052002, "num_tokens": 374968183.0, "step": 9827 }, { "epoch": 1.2502226179875333, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.428279876708984, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8540436625480652, "num_tokens": 375005973.0, "step": 9828 }, { "epoch": 1.2503498282661238, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.47435760498047, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8528714776039124, "num_tokens": 375043446.0, "step": 9829 }, { "epoch": 1.2504770385447144, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.486698150634766, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8629770278930664, "num_tokens": 375081518.0, "step": 9830 }, { "epoch": 1.250604248823305, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.102210998535156, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8517966270446777, "num_tokens": 375117767.0, "step": 9831 }, { "epoch": 1.2507314591018954, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.627723693847656, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8364577889442444, "num_tokens": 375152061.0, "step": 9832 }, { "epoch": 1.250858669380486, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.19668960571289, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8527618646621704, "num_tokens": 375195677.0, "step": 9833 }, { "epoch": 1.2509858796590765, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.89522171020508, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8521060347557068, "num_tokens": 375230319.0, "step": 9834 }, { "epoch": 1.251113089937667, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 37.93885803222656, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8640077710151672, "num_tokens": 375268970.0, "step": 9835 }, { "epoch": 1.2512403002162575, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.51639938354492, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8674513101577759, "num_tokens": 375303661.0, "step": 9836 }, { "epoch": 1.2513675104948478, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.88248825073242, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8675626516342163, "num_tokens": 375343278.0, "step": 9837 }, { "epoch": 1.2514947207734384, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.22259521484375, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8668786287307739, "num_tokens": 375378574.0, "step": 9838 }, { "epoch": 1.251621931052029, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.09440612792969, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8538109064102173, "num_tokens": 375412781.0, "step": 9839 }, { "epoch": 1.2517491413306194, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.295352935791016, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8711715936660767, "num_tokens": 375450431.0, "step": 9840 }, { "epoch": 1.25187635160921, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.94525909423828, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8537397384643555, "num_tokens": 375490692.0, "step": 9841 }, { "epoch": 1.2520035618878005, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.29391860961914, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8653649091720581, "num_tokens": 375535278.0, "step": 9842 }, { "epoch": 1.252130772166391, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.96720504760742, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8534239530563354, "num_tokens": 375575859.0, "step": 9843 }, { "epoch": 1.2522579824449815, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.82583236694336, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8522464036941528, "num_tokens": 375611084.0, "step": 9844 }, { "epoch": 1.252385192723572, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.23244094848633, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8694913387298584, "num_tokens": 375644926.0, "step": 9845 }, { "epoch": 1.2525124030021626, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.781898498535156, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.867796778678894, "num_tokens": 375686970.0, "step": 9846 }, { "epoch": 1.2526396132807531, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.126346588134766, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8694987893104553, "num_tokens": 375727372.0, "step": 9847 }, { "epoch": 1.2527668235593437, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.980926513671875, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8487781882286072, "num_tokens": 375763568.0, "step": 9848 }, { "epoch": 1.2528940338379342, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.553810119628906, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8433725833892822, "num_tokens": 375797954.0, "step": 9849 }, { "epoch": 1.2530212441165247, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.16808319091797, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8571176528930664, "num_tokens": 375836362.0, "step": 9850 }, { "epoch": 1.2531484543951152, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.21348571777344, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8543254137039185, "num_tokens": 375879403.0, "step": 9851 }, { "epoch": 1.2532756646737058, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.80537796020508, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8627992868423462, "num_tokens": 375916728.0, "step": 9852 }, { "epoch": 1.253402874952296, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.001564025878906, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8538904190063477, "num_tokens": 375951743.0, "step": 9853 }, { "epoch": 1.2535300852308866, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.87657165527344, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8332870006561279, "num_tokens": 375988872.0, "step": 9854 }, { "epoch": 1.2536572955094771, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.879554748535156, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8486278653144836, "num_tokens": 376030388.0, "step": 9855 }, { "epoch": 1.2537845057880677, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.06932830810547, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8696187138557434, "num_tokens": 376062975.0, "step": 9856 }, { "epoch": 1.2539117160666582, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.11308288574219, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8508514761924744, "num_tokens": 376101873.0, "step": 9857 }, { "epoch": 1.2540389263452487, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.75328826904297, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8368716239929199, "num_tokens": 376138774.0, "step": 9858 }, { "epoch": 1.2541661366238392, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.45925521850586, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8509607315063477, "num_tokens": 376172357.0, "step": 9859 }, { "epoch": 1.2542933469024298, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.66263961791992, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8481036424636841, "num_tokens": 376212977.0, "step": 9860 }, { "epoch": 1.2544205571810203, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.568416595458984, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8493408560752869, "num_tokens": 376256989.0, "step": 9861 }, { "epoch": 1.2545477674596106, "ewc_loss": 0.1201171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010061264038085938, "grad_norm": 37.375022888183594, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8631138205528259, "num_tokens": 376300130.0, "step": 9862 }, { "epoch": 1.2546749777382011, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 39.03403854370117, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8623719215393066, "num_tokens": 376337475.0, "step": 9863 }, { "epoch": 1.2548021880167917, "ewc_loss": 0.11962890625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010013580322265625, "grad_norm": 37.51982498168945, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8499881625175476, "num_tokens": 376376485.0, "step": 9864 }, { "epoch": 1.2549293982953822, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.666748046875, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8652060627937317, "num_tokens": 376413393.0, "step": 9865 }, { "epoch": 1.2550566085739727, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.49847412109375, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8488413095474243, "num_tokens": 376448192.0, "step": 9866 }, { "epoch": 1.2551838188525632, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.823978424072266, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8615000247955322, "num_tokens": 376489382.0, "step": 9867 }, { "epoch": 1.2553110291311538, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.92567443847656, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.863517701625824, "num_tokens": 376527221.0, "step": 9868 }, { "epoch": 1.2554382394097443, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.27107238769531, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8547053337097168, "num_tokens": 376567045.0, "step": 9869 }, { "epoch": 1.2555654496883348, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.881534576416016, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8658664226531982, "num_tokens": 376607039.0, "step": 9870 }, { "epoch": 1.2556926599669254, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.56085205078125, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8606631755828857, "num_tokens": 376643220.0, "step": 9871 }, { "epoch": 1.255819870245516, "ewc_loss": 0.12060546875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001010894775390625, "grad_norm": 37.969757080078125, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8787118196487427, "num_tokens": 376681417.0, "step": 9872 }, { "epoch": 1.2559470805241064, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.37527084350586, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.877203106880188, "num_tokens": 376722836.0, "step": 9873 }, { "epoch": 1.256074290802697, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.885807037353516, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8521610498428345, "num_tokens": 376763194.0, "step": 9874 }, { "epoch": 1.2562015010812875, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.4813346862793, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8550702929496765, "num_tokens": 376803701.0, "step": 9875 }, { "epoch": 1.256328711359878, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.52747344970703, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8670912981033325, "num_tokens": 376835296.0, "step": 9876 }, { "epoch": 1.2564559216384683, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.65611267089844, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8621605634689331, "num_tokens": 376876317.0, "step": 9877 }, { "epoch": 1.2565831319170588, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.44707489013672, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8582895994186401, "num_tokens": 376916403.0, "step": 9878 }, { "epoch": 1.2567103421956494, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.32207107543945, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8683226704597473, "num_tokens": 376956079.0, "step": 9879 }, { "epoch": 1.25683755247424, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.32320785522461, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.850287914276123, "num_tokens": 376991618.0, "step": 9880 }, { "epoch": 1.2569647627528304, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.98921203613281, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8653643727302551, "num_tokens": 377033964.0, "step": 9881 }, { "epoch": 1.257091973031421, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.42920684814453, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8661982417106628, "num_tokens": 377070091.0, "step": 9882 }, { "epoch": 1.2572191833100115, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.88896560668945, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8672832250595093, "num_tokens": 377111185.0, "step": 9883 }, { "epoch": 1.257346393588602, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.32352828979492, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8635356426239014, "num_tokens": 377147637.0, "step": 9884 }, { "epoch": 1.2574736038671925, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.1409912109375, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8585683107376099, "num_tokens": 377181918.0, "step": 9885 }, { "epoch": 1.2576008141457828, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.02408981323242, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8515790700912476, "num_tokens": 377222900.0, "step": 9886 }, { "epoch": 1.2577280244243734, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.36161422729492, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8731554746627808, "num_tokens": 377263107.0, "step": 9887 }, { "epoch": 1.257855234702964, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.85791015625, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8746535181999207, "num_tokens": 377301384.0, "step": 9888 }, { "epoch": 1.2579824449815544, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.19719696044922, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8528394103050232, "num_tokens": 377339865.0, "step": 9889 }, { "epoch": 1.258109655260145, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 39.02338790893555, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8673714399337769, "num_tokens": 377381582.0, "step": 9890 }, { "epoch": 1.2582368655387355, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.789512634277344, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.835430383682251, "num_tokens": 377425548.0, "step": 9891 }, { "epoch": 1.258364075817326, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 39.09827423095703, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8585755228996277, "num_tokens": 377462298.0, "step": 9892 }, { "epoch": 1.2584912860959165, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.833740234375, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8520851731300354, "num_tokens": 377503873.0, "step": 9893 }, { "epoch": 1.258618496374507, "ewc_loss": 0.125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.98594284057617, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8739787340164185, "num_tokens": 377540497.0, "step": 9894 }, { "epoch": 1.2587457066530976, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.9315071105957, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8590901494026184, "num_tokens": 377582242.0, "step": 9895 }, { "epoch": 1.2588729169316881, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.22895812988281, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8749913573265076, "num_tokens": 377614972.0, "step": 9896 }, { "epoch": 1.2590001272102787, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.11592102050781, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8573352694511414, "num_tokens": 377645392.0, "step": 9897 }, { "epoch": 1.2591273374888692, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.34681701660156, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8671419620513916, "num_tokens": 377680317.0, "step": 9898 }, { "epoch": 1.2592545477674597, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.277645111083984, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8587177991867065, "num_tokens": 377714879.0, "step": 9899 }, { "epoch": 1.2593817580460502, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.36634063720703, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.844490110874176, "num_tokens": 377751404.0, "step": 9900 }, { "epoch": 1.2595089683246408, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.21931457519531, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8640447854995728, "num_tokens": 377788354.0, "step": 9901 }, { "epoch": 1.259636178603231, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.983375549316406, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8486766815185547, "num_tokens": 377827259.0, "step": 9902 }, { "epoch": 1.2597633888818216, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.62632751464844, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8519797325134277, "num_tokens": 377872477.0, "step": 9903 }, { "epoch": 1.2598905991604121, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.238338470458984, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8512248992919922, "num_tokens": 377911159.0, "step": 9904 }, { "epoch": 1.2600178094390027, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.525489807128906, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8623437881469727, "num_tokens": 377951919.0, "step": 9905 }, { "epoch": 1.2601450197175932, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 38.03400802612305, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8413990139961243, "num_tokens": 377997567.0, "step": 9906 }, { "epoch": 1.2602722299961837, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.88874816894531, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8576415777206421, "num_tokens": 378033811.0, "step": 9907 }, { "epoch": 1.2603994402747742, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.6071891784668, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8476208448410034, "num_tokens": 378070136.0, "step": 9908 }, { "epoch": 1.2605266505533648, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.996063232421875, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8562582731246948, "num_tokens": 378106902.0, "step": 9909 }, { "epoch": 1.2606538608319553, "ewc_loss": 0.12109375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010156631469726562, "grad_norm": 37.98295974731445, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8633109331130981, "num_tokens": 378144638.0, "step": 9910 }, { "epoch": 1.2607810711105456, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.267425537109375, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8681163787841797, "num_tokens": 378179059.0, "step": 9911 }, { "epoch": 1.2609082813891361, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.39117431640625, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.843934178352356, "num_tokens": 378210346.0, "step": 9912 }, { "epoch": 1.2610354916677267, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.2468147277832, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8647263050079346, "num_tokens": 378242687.0, "step": 9913 }, { "epoch": 1.2611627019463172, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.173606872558594, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8655656576156616, "num_tokens": 378279421.0, "step": 9914 }, { "epoch": 1.2612899122249077, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.441837310791016, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8552238941192627, "num_tokens": 378318133.0, "step": 9915 }, { "epoch": 1.2614171225034982, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.06633758544922, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8422730565071106, "num_tokens": 378362987.0, "step": 9916 }, { "epoch": 1.2615443327820888, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.63993835449219, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8763583898544312, "num_tokens": 378396197.0, "step": 9917 }, { "epoch": 1.2616715430606793, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.036827087402344, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8671828508377075, "num_tokens": 378433691.0, "step": 9918 }, { "epoch": 1.2617987533392698, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.81181716918945, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.862472653388977, "num_tokens": 378470277.0, "step": 9919 }, { "epoch": 1.2619259636178604, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.806949615478516, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.852647066116333, "num_tokens": 378510500.0, "step": 9920 }, { "epoch": 1.2620531738964509, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.55924987792969, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8584826588630676, "num_tokens": 378552893.0, "step": 9921 }, { "epoch": 1.2621803841750414, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.1356086730957, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.865801215171814, "num_tokens": 378590917.0, "step": 9922 }, { "epoch": 1.262307594453632, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.50628662109375, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8668960332870483, "num_tokens": 378637764.0, "step": 9923 }, { "epoch": 1.2624348047322225, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.805511474609375, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8545586466789246, "num_tokens": 378675497.0, "step": 9924 }, { "epoch": 1.262562015010813, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.98094940185547, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8504059314727783, "num_tokens": 378710470.0, "step": 9925 }, { "epoch": 1.2626892252894033, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.69819259643555, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8555927276611328, "num_tokens": 378758116.0, "step": 9926 }, { "epoch": 1.2628164355679938, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.29118347167969, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8554188013076782, "num_tokens": 378798166.0, "step": 9927 }, { "epoch": 1.2629436458465844, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.43526077270508, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8752472400665283, "num_tokens": 378839899.0, "step": 9928 }, { "epoch": 1.263070856125175, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 37.91200637817383, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8611152172088623, "num_tokens": 378886022.0, "step": 9929 }, { "epoch": 1.2631980664037654, "ewc_loss": 0.125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.28329849243164, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8488650321960449, "num_tokens": 378925097.0, "step": 9930 }, { "epoch": 1.263325276682356, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.280975341796875, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8714393973350525, "num_tokens": 378965242.0, "step": 9931 }, { "epoch": 1.2634524869609465, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.16373062133789, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8378590941429138, "num_tokens": 379005932.0, "step": 9932 }, { "epoch": 1.263579697239537, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.705894470214844, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8563042283058167, "num_tokens": 379045817.0, "step": 9933 }, { "epoch": 1.2637069075181275, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.78218460083008, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8530388474464417, "num_tokens": 379086481.0, "step": 9934 }, { "epoch": 1.2638341177967178, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.45587921142578, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8414615392684937, "num_tokens": 379129782.0, "step": 9935 }, { "epoch": 1.2639613280753084, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.22574234008789, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8559326529502869, "num_tokens": 379171525.0, "step": 9936 }, { "epoch": 1.264088538353899, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 37.943946838378906, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.851757824420929, "num_tokens": 379213887.0, "step": 9937 }, { "epoch": 1.2642157486324894, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 37.98035430908203, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8600214719772339, "num_tokens": 379252634.0, "step": 9938 }, { "epoch": 1.26434295891108, "ewc_loss": 0.125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.485225677490234, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8574671745300293, "num_tokens": 379287402.0, "step": 9939 }, { "epoch": 1.2644701691896705, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.386741638183594, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.864596426486969, "num_tokens": 379325592.0, "step": 9940 }, { "epoch": 1.264597379468261, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.33778381347656, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8456305861473083, "num_tokens": 379359010.0, "step": 9941 }, { "epoch": 1.2647245897468515, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.13730239868164, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8519032597541809, "num_tokens": 379399783.0, "step": 9942 }, { "epoch": 1.264851800025442, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.553466796875, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8616502285003662, "num_tokens": 379442207.0, "step": 9943 }, { "epoch": 1.2649790103040326, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.33404541015625, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8579850196838379, "num_tokens": 379479561.0, "step": 9944 }, { "epoch": 1.2651062205826231, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 37.83610534667969, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8653628826141357, "num_tokens": 379516413.0, "step": 9945 }, { "epoch": 1.2652334308612136, "ewc_loss": 0.125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.81397247314453, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8491869568824768, "num_tokens": 379557993.0, "step": 9946 }, { "epoch": 1.2653606411398042, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 37.96175003051758, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8393513560295105, "num_tokens": 379594028.0, "step": 9947 }, { "epoch": 1.2654878514183947, "ewc_loss": 0.125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.35328674316406, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8518207669258118, "num_tokens": 379631235.0, "step": 9948 }, { "epoch": 1.2656150616969852, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.15367889404297, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8467618227005005, "num_tokens": 379668714.0, "step": 9949 }, { "epoch": 1.2657422719755758, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.685157775878906, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8649362325668335, "num_tokens": 379710233.0, "step": 9950 }, { "epoch": 1.265869482254166, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.009307861328125, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8550046682357788, "num_tokens": 379746208.0, "step": 9951 }, { "epoch": 1.2659966925327566, "ewc_loss": 0.125, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.20275115966797, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8680169582366943, "num_tokens": 379796420.0, "step": 9952 }, { "epoch": 1.2661239028113471, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.05440139770508, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8588085174560547, "num_tokens": 379830027.0, "step": 9953 }, { "epoch": 1.2662511130899377, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.193336486816406, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8589398264884949, "num_tokens": 379871751.0, "step": 9954 }, { "epoch": 1.2663783233685282, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 37.99369812011719, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8531770706176758, "num_tokens": 379908255.0, "step": 9955 }, { "epoch": 1.2665055336471187, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.81187438964844, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8613172769546509, "num_tokens": 379944502.0, "step": 9956 }, { "epoch": 1.2666327439257092, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 37.97112274169922, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8565744161605835, "num_tokens": 379981929.0, "step": 9957 }, { "epoch": 1.2667599542042998, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.3764533996582, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8530009984970093, "num_tokens": 380018876.0, "step": 9958 }, { "epoch": 1.2668871644828903, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.17966079711914, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8549317121505737, "num_tokens": 380054153.0, "step": 9959 }, { "epoch": 1.2670143747614806, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.1276969909668, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8504001498222351, "num_tokens": 380086550.0, "step": 9960 }, { "epoch": 1.2671415850400711, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.37446212768555, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8540425300598145, "num_tokens": 380120308.0, "step": 9961 }, { "epoch": 1.2672687953186617, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 37.98952865600586, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8411777019500732, "num_tokens": 380160204.0, "step": 9962 }, { "epoch": 1.2673960055972522, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.08781433105469, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8621841669082642, "num_tokens": 380196687.0, "step": 9963 }, { "epoch": 1.2675232158758427, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.70016098022461, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8732295632362366, "num_tokens": 380237451.0, "step": 9964 }, { "epoch": 1.2676504261544332, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 37.87306594848633, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8557107448577881, "num_tokens": 380276631.0, "step": 9965 }, { "epoch": 1.2677776364330238, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.76679992675781, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8625308871269226, "num_tokens": 380319432.0, "step": 9966 }, { "epoch": 1.2679048467116143, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.84649658203125, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.867007851600647, "num_tokens": 380356666.0, "step": 9967 }, { "epoch": 1.2680320569902048, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.015777587890625, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8681657910346985, "num_tokens": 380390347.0, "step": 9968 }, { "epoch": 1.2681592672687954, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.50960159301758, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8610844612121582, "num_tokens": 380424083.0, "step": 9969 }, { "epoch": 1.2682864775473859, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.82609558105469, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8537186980247498, "num_tokens": 380471179.0, "step": 9970 }, { "epoch": 1.2684136878259764, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.33489990234375, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8603216409683228, "num_tokens": 380510291.0, "step": 9971 }, { "epoch": 1.268540898104567, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.60760498046875, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8628181219100952, "num_tokens": 380556297.0, "step": 9972 }, { "epoch": 1.2686681083831575, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.34877014160156, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8698872327804565, "num_tokens": 380592113.0, "step": 9973 }, { "epoch": 1.268795318661748, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.33931350708008, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.872850775718689, "num_tokens": 380623942.0, "step": 9974 }, { "epoch": 1.2689225289403383, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.40912628173828, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8511247634887695, "num_tokens": 380662300.0, "step": 9975 }, { "epoch": 1.2690497392189288, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.73191833496094, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8608192205429077, "num_tokens": 380701452.0, "step": 9976 }, { "epoch": 1.2691769494975194, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.02104187011719, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8694473505020142, "num_tokens": 380741248.0, "step": 9977 }, { "epoch": 1.2693041597761099, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.839027404785156, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8529244661331177, "num_tokens": 380781455.0, "step": 9978 }, { "epoch": 1.2694313700547004, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.32449722290039, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8496479392051697, "num_tokens": 380818139.0, "step": 9979 }, { "epoch": 1.269558580333291, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.41933822631836, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.864592432975769, "num_tokens": 380860530.0, "step": 9980 }, { "epoch": 1.2696857906118815, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.29862976074219, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8717714548110962, "num_tokens": 380900825.0, "step": 9981 }, { "epoch": 1.269813000890472, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.533939361572266, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8558281064033508, "num_tokens": 380939270.0, "step": 9982 }, { "epoch": 1.2699402111690625, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.22637939453125, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8369699120521545, "num_tokens": 380981708.0, "step": 9983 }, { "epoch": 1.2700674214476528, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.6484375, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8637621998786926, "num_tokens": 381015186.0, "step": 9984 }, { "epoch": 1.2701946317262434, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.282981872558594, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8689004778862, "num_tokens": 381055503.0, "step": 9985 }, { "epoch": 1.270321842004834, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.47645950317383, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8708515167236328, "num_tokens": 381091935.0, "step": 9986 }, { "epoch": 1.2704490522834244, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.29032897949219, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8641681671142578, "num_tokens": 381127951.0, "step": 9987 }, { "epoch": 1.270576262562015, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.4387321472168, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8534519672393799, "num_tokens": 381161918.0, "step": 9988 }, { "epoch": 1.2707034728406055, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.58120346069336, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8467420339584351, "num_tokens": 381198747.0, "step": 9989 }, { "epoch": 1.270830683119196, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.048606872558594, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8592888116836548, "num_tokens": 381242714.0, "step": 9990 }, { "epoch": 1.2709578933977865, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.8546142578125, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8696227073669434, "num_tokens": 381274667.0, "step": 9991 }, { "epoch": 1.271085103676377, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.99712371826172, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8694875836372375, "num_tokens": 381318110.0, "step": 9992 }, { "epoch": 1.2712123139549676, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.390865325927734, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8591320514678955, "num_tokens": 381356694.0, "step": 9993 }, { "epoch": 1.2713395242335581, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.170799255371094, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8475443124771118, "num_tokens": 381395218.0, "step": 9994 }, { "epoch": 1.2714667345121486, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.222782135009766, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8722220063209534, "num_tokens": 381429195.0, "step": 9995 }, { "epoch": 1.2715939447907392, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.17622375488281, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8531678915023804, "num_tokens": 381468276.0, "step": 9996 }, { "epoch": 1.2717211550693297, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.16130828857422, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8671736717224121, "num_tokens": 381503400.0, "step": 9997 }, { "epoch": 1.2718483653479202, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.183406829833984, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8644640445709229, "num_tokens": 381542126.0, "step": 9998 }, { "epoch": 1.2719755756265108, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.386695861816406, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8673632144927979, "num_tokens": 381579169.0, "step": 9999 }, { "epoch": 1.272102785905101, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.230255126953125, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8448649644851685, "num_tokens": 381615501.0, "step": 10000 }, { "epoch": 1.2722299961836916, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.394287109375, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8537365198135376, "num_tokens": 381649737.0, "step": 10001 }, { "epoch": 1.2723572064622821, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.01422119140625, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8621746301651001, "num_tokens": 381693776.0, "step": 10002 }, { "epoch": 1.2724844167408726, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.52595138549805, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8685653209686279, "num_tokens": 381733324.0, "step": 10003 }, { "epoch": 1.2726116270194632, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.723148345947266, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8525717258453369, "num_tokens": 381765893.0, "step": 10004 }, { "epoch": 1.2727388372980537, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.40708923339844, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8580398559570312, "num_tokens": 381802782.0, "step": 10005 }, { "epoch": 1.2728660475766442, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.87501907348633, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8629966974258423, "num_tokens": 381843448.0, "step": 10006 }, { "epoch": 1.2729932578552348, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.59307861328125, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8483666777610779, "num_tokens": 381878850.0, "step": 10007 }, { "epoch": 1.2731204681338253, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.29964828491211, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8368563652038574, "num_tokens": 381915508.0, "step": 10008 }, { "epoch": 1.2732476784124156, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.54716110229492, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.849150538444519, "num_tokens": 381952699.0, "step": 10009 }, { "epoch": 1.2733748886910061, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.49155044555664, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8516501188278198, "num_tokens": 381989947.0, "step": 10010 }, { "epoch": 1.2735020989695967, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.555660247802734, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8576676249504089, "num_tokens": 382027701.0, "step": 10011 }, { "epoch": 1.2736293092481872, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.321746826171875, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.856918454170227, "num_tokens": 382066415.0, "step": 10012 }, { "epoch": 1.2737565195267777, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.78730392456055, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8533167243003845, "num_tokens": 382108402.0, "step": 10013 }, { "epoch": 1.2738837298053682, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.17062759399414, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.860441267490387, "num_tokens": 382146865.0, "step": 10014 }, { "epoch": 1.2740109400839588, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.312049865722656, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8560121655464172, "num_tokens": 382186746.0, "step": 10015 }, { "epoch": 1.2741381503625493, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.042701721191406, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8731880187988281, "num_tokens": 382220693.0, "step": 10016 }, { "epoch": 1.2742653606411398, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.953338623046875, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8523607850074768, "num_tokens": 382259549.0, "step": 10017 }, { "epoch": 1.2743925709197303, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.3089485168457, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8691240549087524, "num_tokens": 382299164.0, "step": 10018 }, { "epoch": 1.2745197811983209, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.619205474853516, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8724517822265625, "num_tokens": 382333596.0, "step": 10019 }, { "epoch": 1.2746469914769114, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 37.95553970336914, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8608335852622986, "num_tokens": 382372321.0, "step": 10020 }, { "epoch": 1.274774201755502, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.64933776855469, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8623696565628052, "num_tokens": 382411125.0, "step": 10021 }, { "epoch": 1.2749014120340925, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.124778747558594, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8475731611251831, "num_tokens": 382450935.0, "step": 10022 }, { "epoch": 1.275028622312683, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.97711944580078, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8567103147506714, "num_tokens": 382488232.0, "step": 10023 }, { "epoch": 1.2751558325912733, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.074886322021484, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8544418811798096, "num_tokens": 382522280.0, "step": 10024 }, { "epoch": 1.2752830428698638, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.6415901184082, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8687734603881836, "num_tokens": 382562788.0, "step": 10025 }, { "epoch": 1.2754102531484544, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.372039794921875, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8509641885757446, "num_tokens": 382598534.0, "step": 10026 }, { "epoch": 1.2755374634270449, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.113914489746094, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8532036542892456, "num_tokens": 382641803.0, "step": 10027 }, { "epoch": 1.2756646737056354, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.77306365966797, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8688294887542725, "num_tokens": 382677871.0, "step": 10028 }, { "epoch": 1.275791883984226, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.45334243774414, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.868349552154541, "num_tokens": 382714989.0, "step": 10029 }, { "epoch": 1.2759190942628165, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.56079864501953, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8633676767349243, "num_tokens": 382751919.0, "step": 10030 }, { "epoch": 1.276046304541407, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.411563873291016, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8712521195411682, "num_tokens": 382793550.0, "step": 10031 }, { "epoch": 1.2761735148199975, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.30886459350586, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.867583155632019, "num_tokens": 382833142.0, "step": 10032 }, { "epoch": 1.2763007250985878, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.82160949707031, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8664262294769287, "num_tokens": 382869934.0, "step": 10033 }, { "epoch": 1.2764279353771784, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.26383590698242, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8450249433517456, "num_tokens": 382909533.0, "step": 10034 }, { "epoch": 1.2765551456557689, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.68012619018555, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8653311133384705, "num_tokens": 382947462.0, "step": 10035 }, { "epoch": 1.2766823559343594, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.173484802246094, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8593745231628418, "num_tokens": 382989973.0, "step": 10036 }, { "epoch": 1.27680956621295, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.754302978515625, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8586298227310181, "num_tokens": 383033052.0, "step": 10037 }, { "epoch": 1.2769367764915405, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.43315505981445, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8571072816848755, "num_tokens": 383075981.0, "step": 10038 }, { "epoch": 1.277063986770131, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.91516876220703, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.874679446220398, "num_tokens": 383111984.0, "step": 10039 }, { "epoch": 1.2771911970487215, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.55385971069336, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8618332743644714, "num_tokens": 383155323.0, "step": 10040 }, { "epoch": 1.277318407327312, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.88966751098633, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8579816818237305, "num_tokens": 383190069.0, "step": 10041 }, { "epoch": 1.2774456176059026, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.84806442260742, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8509582877159119, "num_tokens": 383224999.0, "step": 10042 }, { "epoch": 1.2775728278844931, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.638065338134766, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8527067303657532, "num_tokens": 383267764.0, "step": 10043 }, { "epoch": 1.2777000381630836, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.64759826660156, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8690983653068542, "num_tokens": 383306148.0, "step": 10044 }, { "epoch": 1.2778272484416742, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 39.117454528808594, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8625452518463135, "num_tokens": 383343408.0, "step": 10045 }, { "epoch": 1.2779544587202647, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.64304733276367, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8832665085792542, "num_tokens": 383383626.0, "step": 10046 }, { "epoch": 1.2780816689988552, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.848018646240234, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8429151177406311, "num_tokens": 383423218.0, "step": 10047 }, { "epoch": 1.2782088792774458, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.92579650878906, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8673725128173828, "num_tokens": 383461126.0, "step": 10048 }, { "epoch": 1.278336089556036, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.57326126098633, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8509293794631958, "num_tokens": 383501808.0, "step": 10049 }, { "epoch": 1.2784632998346266, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.84654998779297, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8804060816764832, "num_tokens": 383542395.0, "step": 10050 }, { "epoch": 1.2785905101132171, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 38.673770904541016, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8566039204597473, "num_tokens": 383579968.0, "step": 10051 }, { "epoch": 1.2787177203918076, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.61112594604492, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8601574897766113, "num_tokens": 383617861.0, "step": 10052 }, { "epoch": 1.2788449306703982, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.92679214477539, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8543355464935303, "num_tokens": 383652534.0, "step": 10053 }, { "epoch": 1.2789721409489887, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.59244155883789, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8545902967453003, "num_tokens": 383693120.0, "step": 10054 }, { "epoch": 1.2790993512275792, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.413909912109375, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8505460619926453, "num_tokens": 383734958.0, "step": 10055 }, { "epoch": 1.2792265615061698, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.29585266113281, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8506357669830322, "num_tokens": 383768873.0, "step": 10056 }, { "epoch": 1.2793537717847603, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.6515998840332, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8613677024841309, "num_tokens": 383804063.0, "step": 10057 }, { "epoch": 1.2794809820633506, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.691410064697266, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8733391761779785, "num_tokens": 383843098.0, "step": 10058 }, { "epoch": 1.2796081923419411, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 39.79379653930664, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8677019476890564, "num_tokens": 383879443.0, "step": 10059 }, { "epoch": 1.2797354026205316, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.421531677246094, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8590858578681946, "num_tokens": 383918071.0, "step": 10060 }, { "epoch": 1.2798626128991222, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.960289001464844, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8518790006637573, "num_tokens": 383953052.0, "step": 10061 }, { "epoch": 1.2799898231777127, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.6650390625, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8583836555480957, "num_tokens": 383992835.0, "step": 10062 }, { "epoch": 1.2801170334563032, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.79435348510742, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8462150692939758, "num_tokens": 384031234.0, "step": 10063 }, { "epoch": 1.2802442437348938, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.48860168457031, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8499369621276855, "num_tokens": 384063611.0, "step": 10064 }, { "epoch": 1.2803714540134843, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.768463134765625, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8688987493515015, "num_tokens": 384106757.0, "step": 10065 }, { "epoch": 1.2804986642920748, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.88954544067383, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8630882501602173, "num_tokens": 384143094.0, "step": 10066 }, { "epoch": 1.2806258745706653, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.069190979003906, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8580845594406128, "num_tokens": 384181498.0, "step": 10067 }, { "epoch": 1.2807530848492559, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.965736389160156, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.862501859664917, "num_tokens": 384209784.0, "step": 10068 }, { "epoch": 1.2808802951278464, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.213809967041016, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8648437261581421, "num_tokens": 384248000.0, "step": 10069 }, { "epoch": 1.281007505406437, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.907127380371094, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8504252433776855, "num_tokens": 384290897.0, "step": 10070 }, { "epoch": 1.2811347156850275, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.662109375, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8528934717178345, "num_tokens": 384329464.0, "step": 10071 }, { "epoch": 1.281261925963618, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.83064270019531, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8616988658905029, "num_tokens": 384368915.0, "step": 10072 }, { "epoch": 1.2813891362422083, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.64572525024414, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.868796706199646, "num_tokens": 384407445.0, "step": 10073 }, { "epoch": 1.2815163465207988, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 39.01116943359375, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8554811477661133, "num_tokens": 384444384.0, "step": 10074 }, { "epoch": 1.2816435567993893, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.85468292236328, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8556521534919739, "num_tokens": 384479670.0, "step": 10075 }, { "epoch": 1.2817707670779799, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.37234115600586, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8472068905830383, "num_tokens": 384522436.0, "step": 10076 }, { "epoch": 1.2818979773565704, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.69831466674805, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8512481451034546, "num_tokens": 384559754.0, "step": 10077 }, { "epoch": 1.282025187635161, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.4587516784668, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8565105199813843, "num_tokens": 384595153.0, "step": 10078 }, { "epoch": 1.2821523979137515, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.97248458862305, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8581488132476807, "num_tokens": 384628345.0, "step": 10079 }, { "epoch": 1.282279608192342, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.3502197265625, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8537826538085938, "num_tokens": 384666725.0, "step": 10080 }, { "epoch": 1.2824068184709325, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.56536865234375, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8496378064155579, "num_tokens": 384706217.0, "step": 10081 }, { "epoch": 1.2825340287495228, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.32218551635742, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8555225133895874, "num_tokens": 384741835.0, "step": 10082 }, { "epoch": 1.2826612390281134, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 39.15425491333008, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8609115481376648, "num_tokens": 384777273.0, "step": 10083 }, { "epoch": 1.2827884493067039, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.33473205566406, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8722926378250122, "num_tokens": 384814950.0, "step": 10084 }, { "epoch": 1.2829156595852944, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.87015151977539, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.850338876247406, "num_tokens": 384848088.0, "step": 10085 }, { "epoch": 1.283042869863885, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.595027923583984, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.851417601108551, "num_tokens": 384887361.0, "step": 10086 }, { "epoch": 1.2831700801424755, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.97622299194336, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8649275898933411, "num_tokens": 384930459.0, "step": 10087 }, { "epoch": 1.283297290421066, "ewc_loss": 0.12255859375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.000102996826171875, "grad_norm": 38.40254592895508, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8530863523483276, "num_tokens": 384969506.0, "step": 10088 }, { "epoch": 1.2834245006996565, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 39.21161651611328, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8648982048034668, "num_tokens": 385004368.0, "step": 10089 }, { "epoch": 1.283551710978247, "ewc_loss": 0.12158203125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010204315185546875, "grad_norm": 37.73389434814453, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8607603311538696, "num_tokens": 385044553.0, "step": 10090 }, { "epoch": 1.2836789212568376, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.98279571533203, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8712987899780273, "num_tokens": 385076873.0, "step": 10091 }, { "epoch": 1.283806131535428, "ewc_loss": 0.1220703125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010251998901367188, "grad_norm": 37.94731521606445, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.861656904220581, "num_tokens": 385113969.0, "step": 10092 }, { "epoch": 1.2839333418140186, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.848602294921875, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.869199275970459, "num_tokens": 385149460.0, "step": 10093 }, { "epoch": 1.2840605520926092, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.34183120727539, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8418458700180054, "num_tokens": 385187240.0, "step": 10094 }, { "epoch": 1.2841877623711997, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.74334716796875, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8519982099533081, "num_tokens": 385225532.0, "step": 10095 }, { "epoch": 1.2843149726497902, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.64449691772461, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8606781959533691, "num_tokens": 385267404.0, "step": 10096 }, { "epoch": 1.2844421829283807, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.56308364868164, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8477007150650024, "num_tokens": 385314008.0, "step": 10097 }, { "epoch": 1.284569393206971, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.206077575683594, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8472963571548462, "num_tokens": 385352204.0, "step": 10098 }, { "epoch": 1.2846966034855616, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 39.127559661865234, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8512206077575684, "num_tokens": 385393307.0, "step": 10099 }, { "epoch": 1.2848238137641521, "ewc_loss": 0.123046875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010347366333007812, "grad_norm": 38.241783142089844, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8584862947463989, "num_tokens": 385426124.0, "step": 10100 }, { "epoch": 1.2849510240427426, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.76027297973633, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8630501627922058, "num_tokens": 385467471.0, "step": 10101 }, { "epoch": 1.2850782343213332, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.18196105957031, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8516824245452881, "num_tokens": 385508780.0, "step": 10102 }, { "epoch": 1.2852054445999237, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.78683853149414, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8643369674682617, "num_tokens": 385552646.0, "step": 10103 }, { "epoch": 1.2853326548785142, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.26645278930664, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8471195697784424, "num_tokens": 385586676.0, "step": 10104 }, { "epoch": 1.2854598651571048, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.71462631225586, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8465020656585693, "num_tokens": 385623201.0, "step": 10105 }, { "epoch": 1.2855870754356953, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.34129333496094, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8700935244560242, "num_tokens": 385662716.0, "step": 10106 }, { "epoch": 1.2857142857142856, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.67116165161133, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8485119342803955, "num_tokens": 385706986.0, "step": 10107 }, { "epoch": 1.2858414959928761, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.39860534667969, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8575788736343384, "num_tokens": 385744240.0, "step": 10108 }, { "epoch": 1.2859687062714666, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.526947021484375, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8421016931533813, "num_tokens": 385778583.0, "step": 10109 }, { "epoch": 1.2860959165500572, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.46795654296875, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8588274717330933, "num_tokens": 385819910.0, "step": 10110 }, { "epoch": 1.2862231268286477, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.50218963623047, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.855748176574707, "num_tokens": 385860068.0, "step": 10111 }, { "epoch": 1.2863503371072382, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.788387298583984, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8489315509796143, "num_tokens": 385890460.0, "step": 10112 }, { "epoch": 1.2864775473858288, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.24352264404297, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8553041219711304, "num_tokens": 385926174.0, "step": 10113 }, { "epoch": 1.2866047576644193, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.70826721191406, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.86806321144104, "num_tokens": 385962928.0, "step": 10114 }, { "epoch": 1.2867319679430098, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.01975631713867, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.868226170539856, "num_tokens": 386002509.0, "step": 10115 }, { "epoch": 1.2868591782216003, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.7906494140625, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8578355312347412, "num_tokens": 386043304.0, "step": 10116 }, { "epoch": 1.2869863885001909, "ewc_loss": 0.12353515625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 37.9677848815918, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8662512302398682, "num_tokens": 386081607.0, "step": 10117 }, { "epoch": 1.2871135987787814, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.70319747924805, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8765312433242798, "num_tokens": 386117762.0, "step": 10118 }, { "epoch": 1.287240809057372, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.448612213134766, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8511694073677063, "num_tokens": 386155908.0, "step": 10119 }, { "epoch": 1.2873680193359625, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.605995178222656, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8564805388450623, "num_tokens": 386199385.0, "step": 10120 }, { "epoch": 1.287495229614553, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.537353515625, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8527690172195435, "num_tokens": 386231892.0, "step": 10121 }, { "epoch": 1.2876224398931433, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.52162170410156, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8404169082641602, "num_tokens": 386269657.0, "step": 10122 }, { "epoch": 1.2877496501717338, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 37.99032974243164, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8636373281478882, "num_tokens": 386306936.0, "step": 10123 }, { "epoch": 1.2878768604503243, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.86347579956055, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8595232963562012, "num_tokens": 386342260.0, "step": 10124 }, { "epoch": 1.2880040707289149, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.07029724121094, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8551880717277527, "num_tokens": 386381784.0, "step": 10125 }, { "epoch": 1.2881312810075054, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.6405029296875, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8573801517486572, "num_tokens": 386422350.0, "step": 10126 }, { "epoch": 1.288258491286096, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.29495620727539, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8534761667251587, "num_tokens": 386467951.0, "step": 10127 }, { "epoch": 1.2883857015646865, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.17338562011719, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8668142557144165, "num_tokens": 386511896.0, "step": 10128 }, { "epoch": 1.288512911843277, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.65999221801758, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8451814651489258, "num_tokens": 386550704.0, "step": 10129 }, { "epoch": 1.2886401221218675, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.282588958740234, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8421128392219543, "num_tokens": 386589295.0, "step": 10130 }, { "epoch": 1.2887673324004578, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.312294006347656, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8585373163223267, "num_tokens": 386623462.0, "step": 10131 }, { "epoch": 1.2888945426790483, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.48332595825195, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.86033034324646, "num_tokens": 386663036.0, "step": 10132 }, { "epoch": 1.2890217529576389, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.17133712768555, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8594669699668884, "num_tokens": 386702530.0, "step": 10133 }, { "epoch": 1.2891489632362294, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.16301727294922, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8724279403686523, "num_tokens": 386739758.0, "step": 10134 }, { "epoch": 1.28927617351482, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.30244445800781, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.867882490158081, "num_tokens": 386778287.0, "step": 10135 }, { "epoch": 1.2894033837934105, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.604637145996094, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8694027662277222, "num_tokens": 386812348.0, "step": 10136 }, { "epoch": 1.289530594072001, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.172035217285156, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8659945130348206, "num_tokens": 386855540.0, "step": 10137 }, { "epoch": 1.2896578043505915, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.625274658203125, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8542321920394897, "num_tokens": 386893775.0, "step": 10138 }, { "epoch": 1.289785014629182, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.14883804321289, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8591809868812561, "num_tokens": 386929553.0, "step": 10139 }, { "epoch": 1.2899122249077726, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.34906768798828, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8508971929550171, "num_tokens": 386968033.0, "step": 10140 }, { "epoch": 1.290039435186363, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.446388244628906, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8577559590339661, "num_tokens": 387004967.0, "step": 10141 }, { "epoch": 1.2901666454649536, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.48778533935547, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8464147448539734, "num_tokens": 387038086.0, "step": 10142 }, { "epoch": 1.2902938557435442, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.64390563964844, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8325423002243042, "num_tokens": 387077112.0, "step": 10143 }, { "epoch": 1.2904210660221347, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.34783935546875, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8407115936279297, "num_tokens": 387118800.0, "step": 10144 }, { "epoch": 1.2905482763007252, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.444541931152344, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8523133993148804, "num_tokens": 387160321.0, "step": 10145 }, { "epoch": 1.2906754865793157, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.46736145019531, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8548237085342407, "num_tokens": 387199387.0, "step": 10146 }, { "epoch": 1.290802696857906, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.70796585083008, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.864129364490509, "num_tokens": 387238859.0, "step": 10147 }, { "epoch": 1.2909299071364966, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.63235092163086, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.86463463306427, "num_tokens": 387272149.0, "step": 10148 }, { "epoch": 1.291057117415087, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.35641098022461, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8559132814407349, "num_tokens": 387311617.0, "step": 10149 }, { "epoch": 1.2911843276936776, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.81593704223633, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.84500652551651, "num_tokens": 387344281.0, "step": 10150 }, { "epoch": 1.2913115379722682, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.38420104980469, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8465448617935181, "num_tokens": 387385959.0, "step": 10151 }, { "epoch": 1.2914387482508587, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.685333251953125, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8533346652984619, "num_tokens": 387426197.0, "step": 10152 }, { "epoch": 1.2915659585294492, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.671051025390625, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.864261269569397, "num_tokens": 387462250.0, "step": 10153 }, { "epoch": 1.2916931688080397, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.505401611328125, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8578045964241028, "num_tokens": 387499903.0, "step": 10154 }, { "epoch": 1.2918203790866303, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.82515335083008, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8670943975448608, "num_tokens": 387535361.0, "step": 10155 }, { "epoch": 1.2919475893652206, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.15491485595703, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8553059697151184, "num_tokens": 387574300.0, "step": 10156 }, { "epoch": 1.2920747996438111, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.78948211669922, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.858993411064148, "num_tokens": 387606860.0, "step": 10157 }, { "epoch": 1.2922020099224016, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.30198287963867, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8458388447761536, "num_tokens": 387646519.0, "step": 10158 }, { "epoch": 1.2923292202009922, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.984676361083984, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8691377639770508, "num_tokens": 387681100.0, "step": 10159 }, { "epoch": 1.2924564304795827, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010395050048828125, "grad_norm": 38.460418701171875, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8800649642944336, "num_tokens": 387715962.0, "step": 10160 }, { "epoch": 1.2925836407581732, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.401702880859375, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8569467663764954, "num_tokens": 387755856.0, "step": 10161 }, { "epoch": 1.2927108510367638, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.666324615478516, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8656963109970093, "num_tokens": 387793256.0, "step": 10162 }, { "epoch": 1.2928380613153543, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.32662582397461, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8525899648666382, "num_tokens": 387834696.0, "step": 10163 }, { "epoch": 1.2929652715939448, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.57084274291992, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.84688800573349, "num_tokens": 387874973.0, "step": 10164 }, { "epoch": 1.2930924818725353, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.432212829589844, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8472331166267395, "num_tokens": 387904887.0, "step": 10165 }, { "epoch": 1.2932196921511259, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.53084945678711, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8654493093490601, "num_tokens": 387944676.0, "step": 10166 }, { "epoch": 1.2933469024297164, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.95395278930664, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8582727909088135, "num_tokens": 387982234.0, "step": 10167 }, { "epoch": 1.293474112708307, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.42177963256836, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8596935868263245, "num_tokens": 388018001.0, "step": 10168 }, { "epoch": 1.2936013229868975, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.7384147644043, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8527626395225525, "num_tokens": 388057476.0, "step": 10169 }, { "epoch": 1.293728533265488, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.35419464111328, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.876996636390686, "num_tokens": 388095351.0, "step": 10170 }, { "epoch": 1.2938557435440783, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.552528381347656, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8591529130935669, "num_tokens": 388135366.0, "step": 10171 }, { "epoch": 1.2939829538226688, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.4071159362793, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8463072776794434, "num_tokens": 388175003.0, "step": 10172 }, { "epoch": 1.2941101641012593, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 39.05222702026367, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.840806245803833, "num_tokens": 388207083.0, "step": 10173 }, { "epoch": 1.2942373743798499, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.80160903930664, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8740785121917725, "num_tokens": 388245038.0, "step": 10174 }, { "epoch": 1.2943645846584404, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.594871520996094, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8560292720794678, "num_tokens": 388285035.0, "step": 10175 }, { "epoch": 1.294491794937031, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 39.08223342895508, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8646570444107056, "num_tokens": 388323971.0, "step": 10176 }, { "epoch": 1.2946190052156215, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.417755126953125, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8539008498191833, "num_tokens": 388362256.0, "step": 10177 }, { "epoch": 1.294746215494212, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.9734992980957, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8793977499008179, "num_tokens": 388401120.0, "step": 10178 }, { "epoch": 1.2948734257728025, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.88278579711914, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8616468906402588, "num_tokens": 388446833.0, "step": 10179 }, { "epoch": 1.2950006360513928, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.36989212036133, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8700959086418152, "num_tokens": 388482266.0, "step": 10180 }, { "epoch": 1.2951278463299833, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.939937591552734, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8593804836273193, "num_tokens": 388518024.0, "step": 10181 }, { "epoch": 1.2952550566085739, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.295570373535156, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8607738614082336, "num_tokens": 388564246.0, "step": 10182 }, { "epoch": 1.2953822668871644, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.69164276123047, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8621272444725037, "num_tokens": 388603556.0, "step": 10183 }, { "epoch": 1.295509477165755, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.3039665222168, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8689576983451843, "num_tokens": 388643615.0, "step": 10184 }, { "epoch": 1.2956366874443455, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.66207504272461, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8633584976196289, "num_tokens": 388682917.0, "step": 10185 }, { "epoch": 1.295763897722936, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.358848571777344, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.861357569694519, "num_tokens": 388720269.0, "step": 10186 }, { "epoch": 1.2958911080015265, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.646705627441406, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8510956168174744, "num_tokens": 388759245.0, "step": 10187 }, { "epoch": 1.296018318280117, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.56694793701172, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8558128476142883, "num_tokens": 388795673.0, "step": 10188 }, { "epoch": 1.2961455285587076, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.27927780151367, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8633694648742676, "num_tokens": 388832926.0, "step": 10189 }, { "epoch": 1.296272738837298, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.480831146240234, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8631163835525513, "num_tokens": 388869767.0, "step": 10190 }, { "epoch": 1.2963999491158886, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 37.969383239746094, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8610484600067139, "num_tokens": 388913347.0, "step": 10191 }, { "epoch": 1.2965271593944792, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.47732925415039, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8716905117034912, "num_tokens": 388948211.0, "step": 10192 }, { "epoch": 1.2966543696730697, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.0947151184082, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8644745349884033, "num_tokens": 388989198.0, "step": 10193 }, { "epoch": 1.2967815799516602, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.91298294067383, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8502185344696045, "num_tokens": 389029076.0, "step": 10194 }, { "epoch": 1.2969087902302507, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.18726348876953, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8600460290908813, "num_tokens": 389066751.0, "step": 10195 }, { "epoch": 1.297036000508841, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.813316345214844, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8571805953979492, "num_tokens": 389104207.0, "step": 10196 }, { "epoch": 1.2971632107874316, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.26435470581055, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8623360991477966, "num_tokens": 389141941.0, "step": 10197 }, { "epoch": 1.297290421066022, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.89609146118164, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8708281517028809, "num_tokens": 389185326.0, "step": 10198 }, { "epoch": 1.2974176313446126, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.080055236816406, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8697726726531982, "num_tokens": 389221512.0, "step": 10199 }, { "epoch": 1.2975448416232032, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.89113998413086, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8663343787193298, "num_tokens": 389259388.0, "step": 10200 }, { "epoch": 1.2976720519017937, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.54258728027344, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8484314680099487, "num_tokens": 389298773.0, "step": 10201 }, { "epoch": 1.2977992621803842, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.50685501098633, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8612436652183533, "num_tokens": 389339680.0, "step": 10202 }, { "epoch": 1.2979264724589747, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.37881088256836, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8631281852722168, "num_tokens": 389380755.0, "step": 10203 }, { "epoch": 1.2980536827375653, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.857723236083984, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8620615005493164, "num_tokens": 389412031.0, "step": 10204 }, { "epoch": 1.2981808930161556, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.635009765625, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8701092600822449, "num_tokens": 389448299.0, "step": 10205 }, { "epoch": 1.298308103294746, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.71076583862305, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8531585335731506, "num_tokens": 389483962.0, "step": 10206 }, { "epoch": 1.2984353135733366, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.488224029541016, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8518762588500977, "num_tokens": 389521989.0, "step": 10207 }, { "epoch": 1.2985625238519272, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.66078186035156, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8545533418655396, "num_tokens": 389558228.0, "step": 10208 }, { "epoch": 1.2986897341305177, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.866634368896484, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8563663959503174, "num_tokens": 389597164.0, "step": 10209 }, { "epoch": 1.2988169444091082, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.693294525146484, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8587470650672913, "num_tokens": 389636435.0, "step": 10210 }, { "epoch": 1.2989441546876987, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.50695037841797, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8500908613204956, "num_tokens": 389670498.0, "step": 10211 }, { "epoch": 1.2990713649662893, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.55908203125, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8606160879135132, "num_tokens": 389707433.0, "step": 10212 }, { "epoch": 1.2991985752448798, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.82136917114258, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8602961897850037, "num_tokens": 389747300.0, "step": 10213 }, { "epoch": 1.2993257855234703, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.6506462097168, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8537268042564392, "num_tokens": 389788381.0, "step": 10214 }, { "epoch": 1.2994529958020609, "ewc_loss": 0.125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.63793182373047, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8611240386962891, "num_tokens": 389821792.0, "step": 10215 }, { "epoch": 1.2995802060806514, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.32625198364258, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8717186450958252, "num_tokens": 389859370.0, "step": 10216 }, { "epoch": 1.299707416359242, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.79929733276367, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8505733013153076, "num_tokens": 389897020.0, "step": 10217 }, { "epoch": 1.2998346266378324, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.023475646972656, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8339725732803345, "num_tokens": 389927107.0, "step": 10218 }, { "epoch": 1.299961836916423, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.7900390625, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8571774959564209, "num_tokens": 389966323.0, "step": 10219 }, { "epoch": 1.3000890471950133, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.595603942871094, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8579656481742859, "num_tokens": 390001416.0, "step": 10220 }, { "epoch": 1.3002162574736038, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.951026916503906, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8616898655891418, "num_tokens": 390042326.0, "step": 10221 }, { "epoch": 1.3003434677521943, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.39350128173828, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8493722677230835, "num_tokens": 390084443.0, "step": 10222 }, { "epoch": 1.3004706780307849, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.8338737487793, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8536931276321411, "num_tokens": 390126309.0, "step": 10223 }, { "epoch": 1.3005978883093754, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.806514739990234, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8643382787704468, "num_tokens": 390161463.0, "step": 10224 }, { "epoch": 1.300725098587966, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.744964599609375, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8689187169075012, "num_tokens": 390203221.0, "step": 10225 }, { "epoch": 1.3008523088665565, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.769805908203125, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8759797811508179, "num_tokens": 390243269.0, "step": 10226 }, { "epoch": 1.300979519145147, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.37681579589844, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8600288033485413, "num_tokens": 390282488.0, "step": 10227 }, { "epoch": 1.3011067294237375, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.87538146972656, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8552845120429993, "num_tokens": 390320268.0, "step": 10228 }, { "epoch": 1.3012339397023278, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.2916145324707, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8489172458648682, "num_tokens": 390354733.0, "step": 10229 }, { "epoch": 1.3013611499809183, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.34115982055664, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8583095669746399, "num_tokens": 390388161.0, "step": 10230 }, { "epoch": 1.3014883602595089, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 37.953861236572266, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8696678876876831, "num_tokens": 390424609.0, "step": 10231 }, { "epoch": 1.3016155705380994, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.545047760009766, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8757606744766235, "num_tokens": 390460809.0, "step": 10232 }, { "epoch": 1.30174278081669, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 38.019996643066406, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8493654131889343, "num_tokens": 390500198.0, "step": 10233 }, { "epoch": 1.3018699910952805, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.219764709472656, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8527418375015259, "num_tokens": 390545588.0, "step": 10234 }, { "epoch": 1.301997201373871, "ewc_loss": 0.1240234375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010442733764648438, "grad_norm": 37.983577728271484, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.850197434425354, "num_tokens": 390581681.0, "step": 10235 }, { "epoch": 1.3021244116524615, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.87255859375, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8760294914245605, "num_tokens": 390619122.0, "step": 10236 }, { "epoch": 1.302251621931052, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.26332092285156, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.846004843711853, "num_tokens": 390667974.0, "step": 10237 }, { "epoch": 1.3023788322096426, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.531578063964844, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8620509505271912, "num_tokens": 390705188.0, "step": 10238 }, { "epoch": 1.302506042488233, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.748497009277344, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8694776296615601, "num_tokens": 390746892.0, "step": 10239 }, { "epoch": 1.3026332527668236, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.48507308959961, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.874447762966156, "num_tokens": 390787727.0, "step": 10240 }, { "epoch": 1.3027604630454142, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.80534744262695, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8521372079849243, "num_tokens": 390823702.0, "step": 10241 }, { "epoch": 1.3028876733240047, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.28828048706055, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8582844734191895, "num_tokens": 390860256.0, "step": 10242 }, { "epoch": 1.3030148836025952, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.55956268310547, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8713375329971313, "num_tokens": 390901676.0, "step": 10243 }, { "epoch": 1.3031420938811857, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.19414138793945, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8409627676010132, "num_tokens": 390936911.0, "step": 10244 }, { "epoch": 1.303269304159776, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.147457122802734, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8476731777191162, "num_tokens": 390976185.0, "step": 10245 }, { "epoch": 1.3033965144383666, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.53717041015625, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8596237897872925, "num_tokens": 391014661.0, "step": 10246 }, { "epoch": 1.303523724716957, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.4111213684082, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8509688377380371, "num_tokens": 391056205.0, "step": 10247 }, { "epoch": 1.3036509349955476, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.43586730957031, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8602133393287659, "num_tokens": 391092187.0, "step": 10248 }, { "epoch": 1.3037781452741382, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.51753616333008, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8746996521949768, "num_tokens": 391130890.0, "step": 10249 }, { "epoch": 1.3039053555527287, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.39879608154297, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8693385124206543, "num_tokens": 391172783.0, "step": 10250 }, { "epoch": 1.3040325658313192, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.76390075683594, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8415242433547974, "num_tokens": 391215031.0, "step": 10251 }, { "epoch": 1.3041597761099097, "ewc_loss": 0.125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.246490478515625, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8627738356590271, "num_tokens": 391251400.0, "step": 10252 }, { "epoch": 1.3042869863885003, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.862056732177734, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8505772352218628, "num_tokens": 391285895.0, "step": 10253 }, { "epoch": 1.3044141966670906, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.71363067626953, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8615388870239258, "num_tokens": 391328361.0, "step": 10254 }, { "epoch": 1.304541406945681, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.6199836730957, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8581200838088989, "num_tokens": 391365019.0, "step": 10255 }, { "epoch": 1.3046686172242716, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.83601760864258, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8439210057258606, "num_tokens": 391400407.0, "step": 10256 }, { "epoch": 1.3047958275028622, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.482879638671875, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8430505394935608, "num_tokens": 391440660.0, "step": 10257 }, { "epoch": 1.3049230377814527, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.853878021240234, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8622086048126221, "num_tokens": 391481096.0, "step": 10258 }, { "epoch": 1.3050502480600432, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.58634948730469, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8593780994415283, "num_tokens": 391516356.0, "step": 10259 }, { "epoch": 1.3051774583386337, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.828582763671875, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8478675484657288, "num_tokens": 391554691.0, "step": 10260 }, { "epoch": 1.3053046686172243, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.61469650268555, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8454679846763611, "num_tokens": 391592473.0, "step": 10261 }, { "epoch": 1.3054318788958148, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.84842300415039, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.864388108253479, "num_tokens": 391632091.0, "step": 10262 }, { "epoch": 1.3055590891744053, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.50166702270508, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8528823852539062, "num_tokens": 391676121.0, "step": 10263 }, { "epoch": 1.3056862994529959, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.81982421875, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8568568229675293, "num_tokens": 391709664.0, "step": 10264 }, { "epoch": 1.3058135097315864, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 37.912784576416016, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8479486107826233, "num_tokens": 391755828.0, "step": 10265 }, { "epoch": 1.305940720010177, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.996826171875, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8765626549720764, "num_tokens": 391800232.0, "step": 10266 }, { "epoch": 1.3060679302887674, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 37.895172119140625, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8606740236282349, "num_tokens": 391837612.0, "step": 10267 }, { "epoch": 1.306195140567358, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.23735427856445, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8702152967453003, "num_tokens": 391871193.0, "step": 10268 }, { "epoch": 1.3063223508459483, "ewc_loss": 0.12451171875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.408668518066406, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8516011238098145, "num_tokens": 391908203.0, "step": 10269 }, { "epoch": 1.3064495611245388, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.0483283996582, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8634886741638184, "num_tokens": 391938744.0, "step": 10270 }, { "epoch": 1.3065767714031293, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.20592498779297, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8620208501815796, "num_tokens": 391976020.0, "step": 10271 }, { "epoch": 1.3067039816817199, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.9932746887207, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.850551187992096, "num_tokens": 392018006.0, "step": 10272 }, { "epoch": 1.3068311919603104, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 37.958045959472656, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8560002446174622, "num_tokens": 392055358.0, "step": 10273 }, { "epoch": 1.306958402238901, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.921653747558594, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8569414615631104, "num_tokens": 392097727.0, "step": 10274 }, { "epoch": 1.3070856125174914, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.31459045410156, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8647496104240417, "num_tokens": 392144641.0, "step": 10275 }, { "epoch": 1.307212822796082, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.130767822265625, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8669686913490295, "num_tokens": 392178810.0, "step": 10276 }, { "epoch": 1.3073400330746725, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.315330505371094, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8539015054702759, "num_tokens": 392213571.0, "step": 10277 }, { "epoch": 1.3074672433532628, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.81876754760742, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8563777208328247, "num_tokens": 392254628.0, "step": 10278 }, { "epoch": 1.3075944536318533, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.80679702758789, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8570296764373779, "num_tokens": 392297243.0, "step": 10279 }, { "epoch": 1.3077216639104439, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.455440521240234, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8611316680908203, "num_tokens": 392329116.0, "step": 10280 }, { "epoch": 1.3078488741890344, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.706138610839844, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8555917739868164, "num_tokens": 392370542.0, "step": 10281 }, { "epoch": 1.307976084467625, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.842926025390625, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8646462559700012, "num_tokens": 392411155.0, "step": 10282 }, { "epoch": 1.3081032947462155, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.69502258300781, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.859187126159668, "num_tokens": 392450440.0, "step": 10283 }, { "epoch": 1.308230505024806, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.634239196777344, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8690382242202759, "num_tokens": 392488545.0, "step": 10284 }, { "epoch": 1.3083577153033965, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.859703063964844, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8592019081115723, "num_tokens": 392527138.0, "step": 10285 }, { "epoch": 1.308484925581987, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.59412384033203, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8588482141494751, "num_tokens": 392568181.0, "step": 10286 }, { "epoch": 1.3086121358605776, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.41847229003906, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8735913634300232, "num_tokens": 392610900.0, "step": 10287 }, { "epoch": 1.308739346139168, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.850547790527344, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8514014482498169, "num_tokens": 392646624.0, "step": 10288 }, { "epoch": 1.3088665564177586, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.77779769897461, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8580902814865112, "num_tokens": 392683103.0, "step": 10289 }, { "epoch": 1.3089937666963491, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.49378967285156, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8598917722702026, "num_tokens": 392723169.0, "step": 10290 }, { "epoch": 1.3091209769749397, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.783203125, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8739292621612549, "num_tokens": 392762323.0, "step": 10291 }, { "epoch": 1.3092481872535302, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.48033142089844, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8758620023727417, "num_tokens": 392797555.0, "step": 10292 }, { "epoch": 1.3093753975321207, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.78535461425781, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8700243830680847, "num_tokens": 392840001.0, "step": 10293 }, { "epoch": 1.309502607810711, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.54443359375, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8590421676635742, "num_tokens": 392880871.0, "step": 10294 }, { "epoch": 1.3096298180893016, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.90591049194336, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.868594229221344, "num_tokens": 392917554.0, "step": 10295 }, { "epoch": 1.309757028367892, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.314414978027344, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8774915933609009, "num_tokens": 392958601.0, "step": 10296 }, { "epoch": 1.3098842386464826, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.978031158447266, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8607021570205688, "num_tokens": 393000108.0, "step": 10297 }, { "epoch": 1.3100114489250732, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.317657470703125, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8802280426025391, "num_tokens": 393032321.0, "step": 10298 }, { "epoch": 1.3101386592036637, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.8476676940918, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8577044010162354, "num_tokens": 393068347.0, "step": 10299 }, { "epoch": 1.3102658694822542, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.75499725341797, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8765696883201599, "num_tokens": 393102509.0, "step": 10300 }, { "epoch": 1.3103930797608447, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.64112854003906, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.869188666343689, "num_tokens": 393140974.0, "step": 10301 }, { "epoch": 1.3105202900394353, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.84949493408203, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8698922991752625, "num_tokens": 393177214.0, "step": 10302 }, { "epoch": 1.3106475003180256, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.678794860839844, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8626682758331299, "num_tokens": 393215801.0, "step": 10303 }, { "epoch": 1.310774710596616, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.94929885864258, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8710982799530029, "num_tokens": 393256444.0, "step": 10304 }, { "epoch": 1.3109019208752066, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.77599334716797, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8562002778053284, "num_tokens": 393300238.0, "step": 10305 }, { "epoch": 1.3110291311537972, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.946556091308594, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.845632791519165, "num_tokens": 393344087.0, "step": 10306 }, { "epoch": 1.3111563414323877, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.11653518676758, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8648696541786194, "num_tokens": 393381678.0, "step": 10307 }, { "epoch": 1.3112835517109782, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.580265045166016, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8586937189102173, "num_tokens": 393415740.0, "step": 10308 }, { "epoch": 1.3114107619895687, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.14212417602539, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8355545997619629, "num_tokens": 393457692.0, "step": 10309 }, { "epoch": 1.3115379722681593, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.75216293334961, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.855268657207489, "num_tokens": 393499003.0, "step": 10310 }, { "epoch": 1.3116651825467498, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.10814666748047, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8485392332077026, "num_tokens": 393533842.0, "step": 10311 }, { "epoch": 1.3117923928253403, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.73453140258789, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8684683442115784, "num_tokens": 393573906.0, "step": 10312 }, { "epoch": 1.3119196031039309, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.985931396484375, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8585928082466125, "num_tokens": 393615298.0, "step": 10313 }, { "epoch": 1.3120468133825214, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.607791900634766, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8490682244300842, "num_tokens": 393654690.0, "step": 10314 }, { "epoch": 1.312174023661112, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.141178131103516, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8607730269432068, "num_tokens": 393696222.0, "step": 10315 }, { "epoch": 1.3123012339397024, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.65629577636719, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8567995429039001, "num_tokens": 393736748.0, "step": 10316 }, { "epoch": 1.312428444218293, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.71189498901367, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.851194441318512, "num_tokens": 393773077.0, "step": 10317 }, { "epoch": 1.3125556544968833, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.96800994873047, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8708146810531616, "num_tokens": 393806466.0, "step": 10318 }, { "epoch": 1.3126828647754738, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.926918029785156, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8556050062179565, "num_tokens": 393840922.0, "step": 10319 }, { "epoch": 1.3128100750540643, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.912025451660156, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8633725643157959, "num_tokens": 393874544.0, "step": 10320 }, { "epoch": 1.3129372853326549, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.512107849121094, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.854630708694458, "num_tokens": 393917773.0, "step": 10321 }, { "epoch": 1.3130644956112454, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.95261764526367, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8548330664634705, "num_tokens": 393951525.0, "step": 10322 }, { "epoch": 1.313191705889836, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.60280990600586, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8635295033454895, "num_tokens": 393989796.0, "step": 10323 }, { "epoch": 1.3133189161684264, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.15781784057617, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8573424816131592, "num_tokens": 394028746.0, "step": 10324 }, { "epoch": 1.313446126447017, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.53059768676758, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8679236769676208, "num_tokens": 394065241.0, "step": 10325 }, { "epoch": 1.3135733367256075, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.48963928222656, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8540766835212708, "num_tokens": 394110113.0, "step": 10326 }, { "epoch": 1.3137005470041978, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.730255126953125, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8425005674362183, "num_tokens": 394146577.0, "step": 10327 }, { "epoch": 1.3138277572827883, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.50741958618164, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8598451614379883, "num_tokens": 394185392.0, "step": 10328 }, { "epoch": 1.3139549675613789, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 37.936065673828125, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8494974374771118, "num_tokens": 394226301.0, "step": 10329 }, { "epoch": 1.3140821778399694, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.342376708984375, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8504441976547241, "num_tokens": 394262654.0, "step": 10330 }, { "epoch": 1.31420938811856, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.43588638305664, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8711128234863281, "num_tokens": 394299185.0, "step": 10331 }, { "epoch": 1.3143365983971504, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.00819778442383, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8609932661056519, "num_tokens": 394331364.0, "step": 10332 }, { "epoch": 1.314463808675741, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.591773986816406, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8477110862731934, "num_tokens": 394375479.0, "step": 10333 }, { "epoch": 1.3145910189543315, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.0950813293457, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8650923371315002, "num_tokens": 394414913.0, "step": 10334 }, { "epoch": 1.314718229232922, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.75772476196289, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.859564483165741, "num_tokens": 394450889.0, "step": 10335 }, { "epoch": 1.3148454395115126, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.70413589477539, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8435001373291016, "num_tokens": 394495280.0, "step": 10336 }, { "epoch": 1.314972649790103, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.46799087524414, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8684481382369995, "num_tokens": 394536811.0, "step": 10337 }, { "epoch": 1.3150998600686936, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.542083740234375, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8569398522377014, "num_tokens": 394573353.0, "step": 10338 }, { "epoch": 1.3152270703472841, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.01662826538086, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8510068655014038, "num_tokens": 394603517.0, "step": 10339 }, { "epoch": 1.3153542806258747, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.48490905761719, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8658395409584045, "num_tokens": 394650547.0, "step": 10340 }, { "epoch": 1.3154814909044652, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.0892448425293, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8635720014572144, "num_tokens": 394684460.0, "step": 10341 }, { "epoch": 1.3156087011830557, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.601318359375, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8763039708137512, "num_tokens": 394733520.0, "step": 10342 }, { "epoch": 1.315735911461646, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.17955017089844, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8538147211074829, "num_tokens": 394775498.0, "step": 10343 }, { "epoch": 1.3158631217402366, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.87871170043945, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8454709053039551, "num_tokens": 394815138.0, "step": 10344 }, { "epoch": 1.315990332018827, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.90982437133789, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8515346050262451, "num_tokens": 394850447.0, "step": 10345 }, { "epoch": 1.3161175422974176, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 39.135494232177734, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8531296253204346, "num_tokens": 394891800.0, "step": 10346 }, { "epoch": 1.3162447525760081, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.69462966918945, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8652225732803345, "num_tokens": 394930087.0, "step": 10347 }, { "epoch": 1.3163719628545987, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.99673843383789, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8694825172424316, "num_tokens": 394964497.0, "step": 10348 }, { "epoch": 1.3164991731331892, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.800201416015625, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8414840698242188, "num_tokens": 395003127.0, "step": 10349 }, { "epoch": 1.3166263834117797, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.755592346191406, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8494107127189636, "num_tokens": 395040570.0, "step": 10350 }, { "epoch": 1.3167535936903703, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.811607360839844, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8600473999977112, "num_tokens": 395075205.0, "step": 10351 }, { "epoch": 1.3168808039689606, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.70886993408203, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8587274551391602, "num_tokens": 395117492.0, "step": 10352 }, { "epoch": 1.317008014247551, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.1417350769043, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8573461174964905, "num_tokens": 395162449.0, "step": 10353 }, { "epoch": 1.3171352245261416, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.02079772949219, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8667995929718018, "num_tokens": 395197310.0, "step": 10354 }, { "epoch": 1.3172624348047322, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.61219024658203, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8693962097167969, "num_tokens": 395232420.0, "step": 10355 }, { "epoch": 1.3173896450833227, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.608516693115234, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8368843197822571, "num_tokens": 395269392.0, "step": 10356 }, { "epoch": 1.3175168553619132, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.1085205078125, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8467762470245361, "num_tokens": 395305496.0, "step": 10357 }, { "epoch": 1.3176440656405037, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.52216720581055, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8688236474990845, "num_tokens": 395347089.0, "step": 10358 }, { "epoch": 1.3177712759190943, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.626220703125, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8708068132400513, "num_tokens": 395382885.0, "step": 10359 }, { "epoch": 1.3178984861976848, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.099788665771484, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8724637031555176, "num_tokens": 395416260.0, "step": 10360 }, { "epoch": 1.3180256964762753, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.49654006958008, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8508294820785522, "num_tokens": 395454179.0, "step": 10361 }, { "epoch": 1.3181529067548658, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.90040588378906, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8505351543426514, "num_tokens": 395489173.0, "step": 10362 }, { "epoch": 1.3182801170334564, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.96754455566406, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8546085953712463, "num_tokens": 395532057.0, "step": 10363 }, { "epoch": 1.318407327312047, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.864356994628906, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8667011857032776, "num_tokens": 395570321.0, "step": 10364 }, { "epoch": 1.3185345375906374, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.04829406738281, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8802527189254761, "num_tokens": 395609041.0, "step": 10365 }, { "epoch": 1.318661747869228, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.53444290161133, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8358048796653748, "num_tokens": 395649961.0, "step": 10366 }, { "epoch": 1.3187889581478183, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.83181381225586, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8609207272529602, "num_tokens": 395691882.0, "step": 10367 }, { "epoch": 1.3189161684264088, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.7401123046875, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8472321033477783, "num_tokens": 395731496.0, "step": 10368 }, { "epoch": 1.3190433787049993, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.719268798828125, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8640657067298889, "num_tokens": 395772618.0, "step": 10369 }, { "epoch": 1.3191705889835899, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.98264694213867, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8425089716911316, "num_tokens": 395811907.0, "step": 10370 }, { "epoch": 1.3192977992621804, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.866268157958984, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8526527881622314, "num_tokens": 395851186.0, "step": 10371 }, { "epoch": 1.319425009540771, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.76991271972656, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8587636947631836, "num_tokens": 395893868.0, "step": 10372 }, { "epoch": 1.3195522198193614, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.613250732421875, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8691496849060059, "num_tokens": 395930519.0, "step": 10373 }, { "epoch": 1.319679430097952, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.09265899658203, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8744339942932129, "num_tokens": 395965692.0, "step": 10374 }, { "epoch": 1.3198066403765425, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.79813003540039, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8597456216812134, "num_tokens": 396002316.0, "step": 10375 }, { "epoch": 1.3199338506551328, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.94519805908203, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8662595748901367, "num_tokens": 396043392.0, "step": 10376 }, { "epoch": 1.3200610609337233, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.66128921508789, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8575423359870911, "num_tokens": 396078603.0, "step": 10377 }, { "epoch": 1.3201882712123139, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.17240905761719, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8493907451629639, "num_tokens": 396121132.0, "step": 10378 }, { "epoch": 1.3203154814909044, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.561580657958984, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8687766790390015, "num_tokens": 396155730.0, "step": 10379 }, { "epoch": 1.320442691769495, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.2724609375, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8457825183868408, "num_tokens": 396190976.0, "step": 10380 }, { "epoch": 1.3205699020480854, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.80939865112305, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8593395352363586, "num_tokens": 396229571.0, "step": 10381 }, { "epoch": 1.320697112326676, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.768741607666016, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8594004511833191, "num_tokens": 396265586.0, "step": 10382 }, { "epoch": 1.3208243226052665, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.496456146240234, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8639191389083862, "num_tokens": 396303810.0, "step": 10383 }, { "epoch": 1.320951532883857, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.71510314941406, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8782145977020264, "num_tokens": 396345671.0, "step": 10384 }, { "epoch": 1.3210787431624476, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.27360534667969, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8765784502029419, "num_tokens": 396388062.0, "step": 10385 }, { "epoch": 1.321205953441038, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.556671142578125, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.871074914932251, "num_tokens": 396425189.0, "step": 10386 }, { "epoch": 1.3213331637196286, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.46470260620117, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8492037057876587, "num_tokens": 396461861.0, "step": 10387 }, { "epoch": 1.3214603739982191, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.27363204956055, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8708022832870483, "num_tokens": 396506190.0, "step": 10388 }, { "epoch": 1.3215875842768097, "ewc_loss": 0.125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.64762878417969, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8614693880081177, "num_tokens": 396538900.0, "step": 10389 }, { "epoch": 1.3217147945554002, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.97376251220703, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8680647015571594, "num_tokens": 396577029.0, "step": 10390 }, { "epoch": 1.3218420048339907, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.92877197265625, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8439204096794128, "num_tokens": 396616917.0, "step": 10391 }, { "epoch": 1.321969215112581, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.969642639160156, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8683987855911255, "num_tokens": 396651726.0, "step": 10392 }, { "epoch": 1.3220964253911716, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.35203170776367, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8479936122894287, "num_tokens": 396693796.0, "step": 10393 }, { "epoch": 1.322223635669762, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.7402229309082, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8487933874130249, "num_tokens": 396731947.0, "step": 10394 }, { "epoch": 1.3223508459483526, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.756290435791016, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8636500239372253, "num_tokens": 396767062.0, "step": 10395 }, { "epoch": 1.3224780562269431, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.87331771850586, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8638346195220947, "num_tokens": 396803918.0, "step": 10396 }, { "epoch": 1.3226052665055337, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.621734619140625, "learning_rate": 1e-06, "loss": 0.6348, "mean_token_accuracy": 0.8334773778915405, "num_tokens": 396850314.0, "step": 10397 }, { "epoch": 1.3227324767841242, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.8365478515625, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8524916172027588, "num_tokens": 396887920.0, "step": 10398 }, { "epoch": 1.3228596870627147, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.44833755493164, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8507182598114014, "num_tokens": 396923369.0, "step": 10399 }, { "epoch": 1.3229868973413053, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.49113464355469, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8576674461364746, "num_tokens": 396962338.0, "step": 10400 }, { "epoch": 1.3231141076198956, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.725101470947266, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8688032031059265, "num_tokens": 396998275.0, "step": 10401 }, { "epoch": 1.323241317898486, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.70473861694336, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8563207983970642, "num_tokens": 397040034.0, "step": 10402 }, { "epoch": 1.3233685281770766, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.315574645996094, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8789278268814087, "num_tokens": 397080549.0, "step": 10403 }, { "epoch": 1.3234957384556671, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.655113220214844, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.87257981300354, "num_tokens": 397120803.0, "step": 10404 }, { "epoch": 1.3236229487342577, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.284637451171875, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8639542460441589, "num_tokens": 397160463.0, "step": 10405 }, { "epoch": 1.3237501590128482, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.82119369506836, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8530690670013428, "num_tokens": 397205425.0, "step": 10406 }, { "epoch": 1.3238773692914387, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.01848602294922, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8537465929985046, "num_tokens": 397238907.0, "step": 10407 }, { "epoch": 1.3240045795700293, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.921775817871094, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8604912161827087, "num_tokens": 397281639.0, "step": 10408 }, { "epoch": 1.3241317898486198, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.785133361816406, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8480435609817505, "num_tokens": 397323350.0, "step": 10409 }, { "epoch": 1.3242590001272103, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.154541015625, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8538123965263367, "num_tokens": 397364438.0, "step": 10410 }, { "epoch": 1.3243862104058008, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.6473274230957, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8466122150421143, "num_tokens": 397406881.0, "step": 10411 }, { "epoch": 1.3245134206843914, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.36003494262695, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8622009754180908, "num_tokens": 397451190.0, "step": 10412 }, { "epoch": 1.324640630962982, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.55805206298828, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8605889081954956, "num_tokens": 397491063.0, "step": 10413 }, { "epoch": 1.3247678412415724, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.520301818847656, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8592336773872375, "num_tokens": 397529961.0, "step": 10414 }, { "epoch": 1.324895051520163, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.37327194213867, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.869924008846283, "num_tokens": 397565616.0, "step": 10415 }, { "epoch": 1.3250222617987533, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.51084518432617, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8657586574554443, "num_tokens": 397602223.0, "step": 10416 }, { "epoch": 1.3251494720773438, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.40481948852539, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8378368616104126, "num_tokens": 397638910.0, "step": 10417 }, { "epoch": 1.3252766823559343, "ewc_loss": 0.130859375, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.38991165161133, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8513457775115967, "num_tokens": 397681200.0, "step": 10418 }, { "epoch": 1.3254038926345248, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.955081939697266, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8612180948257446, "num_tokens": 397724832.0, "step": 10419 }, { "epoch": 1.3255311029131154, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.137306213378906, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8520674705505371, "num_tokens": 397765537.0, "step": 10420 }, { "epoch": 1.325658313191706, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.9822883605957, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8511829376220703, "num_tokens": 397800945.0, "step": 10421 }, { "epoch": 1.3257855234702964, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.02229309082031, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8652624487876892, "num_tokens": 397840786.0, "step": 10422 }, { "epoch": 1.325912733748887, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.163726806640625, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8635256886482239, "num_tokens": 397875607.0, "step": 10423 }, { "epoch": 1.3260399440274775, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.77131271362305, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8742902874946594, "num_tokens": 397909524.0, "step": 10424 }, { "epoch": 1.3261671543060678, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.38498306274414, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.843225359916687, "num_tokens": 397947862.0, "step": 10425 }, { "epoch": 1.3262943645846583, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.765201568603516, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8748058080673218, "num_tokens": 397980958.0, "step": 10426 }, { "epoch": 1.3264215748632489, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.23609161376953, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8566802740097046, "num_tokens": 398021231.0, "step": 10427 }, { "epoch": 1.3265487851418394, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.96052551269531, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.85721755027771, "num_tokens": 398059202.0, "step": 10428 }, { "epoch": 1.32667599542043, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.76129150390625, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8804018497467041, "num_tokens": 398097121.0, "step": 10429 }, { "epoch": 1.3268032056990204, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.599239349365234, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8598095774650574, "num_tokens": 398130413.0, "step": 10430 }, { "epoch": 1.326930415977611, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.237998962402344, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8592872023582458, "num_tokens": 398172535.0, "step": 10431 }, { "epoch": 1.3270576262562015, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.68366622924805, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8609298467636108, "num_tokens": 398206681.0, "step": 10432 }, { "epoch": 1.327184836534792, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 38.688602447509766, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8633303642272949, "num_tokens": 398245206.0, "step": 10433 }, { "epoch": 1.3273120468133826, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.65196990966797, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8559617400169373, "num_tokens": 398278902.0, "step": 10434 }, { "epoch": 1.327439257091973, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.6773796081543, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8601305484771729, "num_tokens": 398315833.0, "step": 10435 }, { "epoch": 1.3275664673705636, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.272621154785156, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8420642018318176, "num_tokens": 398356496.0, "step": 10436 }, { "epoch": 1.3276936776491541, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.023712158203125, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8661414980888367, "num_tokens": 398393661.0, "step": 10437 }, { "epoch": 1.3278208879277447, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.895015716552734, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8766475915908813, "num_tokens": 398430463.0, "step": 10438 }, { "epoch": 1.3279480982063352, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.18864440917969, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8577312231063843, "num_tokens": 398470382.0, "step": 10439 }, { "epoch": 1.3280753084849257, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.737945556640625, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8430030345916748, "num_tokens": 398504793.0, "step": 10440 }, { "epoch": 1.328202518763516, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.25358963012695, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8500256538391113, "num_tokens": 398541038.0, "step": 10441 }, { "epoch": 1.3283297290421066, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.864234924316406, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8446592688560486, "num_tokens": 398578047.0, "step": 10442 }, { "epoch": 1.328456939320697, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.284549713134766, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8476061820983887, "num_tokens": 398618946.0, "step": 10443 }, { "epoch": 1.3285841495992876, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.82415771484375, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8644276857376099, "num_tokens": 398656895.0, "step": 10444 }, { "epoch": 1.3287113598778781, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.77422332763672, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8594179749488831, "num_tokens": 398696521.0, "step": 10445 }, { "epoch": 1.3288385701564687, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 38.97520065307617, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8611259460449219, "num_tokens": 398735275.0, "step": 10446 }, { "epoch": 1.3289657804350592, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.51298141479492, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8445501327514648, "num_tokens": 398774753.0, "step": 10447 }, { "epoch": 1.3290929907136497, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.18769454956055, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8532071709632874, "num_tokens": 398810893.0, "step": 10448 }, { "epoch": 1.3292202009922403, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.487403869628906, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8538569211959839, "num_tokens": 398849741.0, "step": 10449 }, { "epoch": 1.3293474112708306, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.26258087158203, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8538055419921875, "num_tokens": 398881186.0, "step": 10450 }, { "epoch": 1.329474621549421, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.60302734375, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8759323358535767, "num_tokens": 398918677.0, "step": 10451 }, { "epoch": 1.3296018318280116, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.08753204345703, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8569211959838867, "num_tokens": 398953435.0, "step": 10452 }, { "epoch": 1.3297290421066021, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.107994079589844, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8657587170600891, "num_tokens": 398991799.0, "step": 10453 }, { "epoch": 1.3298562523851927, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.67854690551758, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8633980751037598, "num_tokens": 399023211.0, "step": 10454 }, { "epoch": 1.3299834626637832, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.239341735839844, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.85563063621521, "num_tokens": 399061431.0, "step": 10455 }, { "epoch": 1.3301106729423737, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.9788742065429688e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.55266189575195, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8614664077758789, "num_tokens": 399097384.0, "step": 10456 }, { "epoch": 1.3302378832209643, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.047157287597656, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8619910478591919, "num_tokens": 399133935.0, "step": 10457 }, { "epoch": 1.3303650934995548, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.95121765136719, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8652156591415405, "num_tokens": 399165709.0, "step": 10458 }, { "epoch": 1.3304923037781453, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.73459243774414, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8568054437637329, "num_tokens": 399203548.0, "step": 10459 }, { "epoch": 1.3306195140567358, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.15012741088867, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8580935597419739, "num_tokens": 399244341.0, "step": 10460 }, { "epoch": 1.3307467243353264, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.36445617675781, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8647351861000061, "num_tokens": 399282284.0, "step": 10461 }, { "epoch": 1.330873934613917, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.508426666259766, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8738049268722534, "num_tokens": 399320523.0, "step": 10462 }, { "epoch": 1.3310011448925074, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.292964935302734, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8588725924491882, "num_tokens": 399357489.0, "step": 10463 }, { "epoch": 1.331128355171098, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.37356948852539, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8471508026123047, "num_tokens": 399393561.0, "step": 10464 }, { "epoch": 1.3312555654496883, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.71619415283203, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8642047643661499, "num_tokens": 399424514.0, "step": 10465 }, { "epoch": 1.3313827757282788, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.2620964050293, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8654430508613586, "num_tokens": 399461372.0, "step": 10466 }, { "epoch": 1.3315099860068693, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.03558349609375, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8749734163284302, "num_tokens": 399502503.0, "step": 10467 }, { "epoch": 1.3316371962854598, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.03965759277344, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8521610498428345, "num_tokens": 399537836.0, "step": 10468 }, { "epoch": 1.3317644065640504, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.081214904785156, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8542568683624268, "num_tokens": 399575189.0, "step": 10469 }, { "epoch": 1.331891616842641, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.91606140136719, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.840316653251648, "num_tokens": 399614406.0, "step": 10470 }, { "epoch": 1.3320188271212314, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.92246627807617, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8658922910690308, "num_tokens": 399652847.0, "step": 10471 }, { "epoch": 1.332146037399822, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.84334945678711, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8499008417129517, "num_tokens": 399691628.0, "step": 10472 }, { "epoch": 1.3322732476784125, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.06388473510742, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.870847761631012, "num_tokens": 399732875.0, "step": 10473 }, { "epoch": 1.3324004579570028, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.67585754394531, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8696099519729614, "num_tokens": 399769537.0, "step": 10474 }, { "epoch": 1.3325276682355933, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.136749267578125, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8439427614212036, "num_tokens": 399807757.0, "step": 10475 }, { "epoch": 1.3326548785141838, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.8828239440918, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8662595748901367, "num_tokens": 399837351.0, "step": 10476 }, { "epoch": 1.3327820887927744, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.85502243041992, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8784856796264648, "num_tokens": 399873678.0, "step": 10477 }, { "epoch": 1.332909299071365, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.636817932128906, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8612788319587708, "num_tokens": 399916206.0, "step": 10478 }, { "epoch": 1.3330365093499554, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.87120819091797, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8713767528533936, "num_tokens": 399954144.0, "step": 10479 }, { "epoch": 1.333163719628546, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.43976974487305, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8674994707107544, "num_tokens": 399995865.0, "step": 10480 }, { "epoch": 1.3332909299071365, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.181556701660156, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8612595200538635, "num_tokens": 400033234.0, "step": 10481 }, { "epoch": 1.333418140185727, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.58799743652344, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8683476448059082, "num_tokens": 400066842.0, "step": 10482 }, { "epoch": 1.3335453504643175, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.8446159362793, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8696895837783813, "num_tokens": 400099380.0, "step": 10483 }, { "epoch": 1.333672560742908, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.95023727416992, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8715686798095703, "num_tokens": 400137419.0, "step": 10484 }, { "epoch": 1.3337997710214986, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.726043701171875, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8608239889144897, "num_tokens": 400171086.0, "step": 10485 }, { "epoch": 1.3339269813000891, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.05414581298828, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.858248233795166, "num_tokens": 400206763.0, "step": 10486 }, { "epoch": 1.3340541915786797, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.421688079833984, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.844356119632721, "num_tokens": 400242708.0, "step": 10487 }, { "epoch": 1.3341814018572702, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.284568786621094, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8399732708930969, "num_tokens": 400277417.0, "step": 10488 }, { "epoch": 1.3343086121358605, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.661338806152344, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8584243059158325, "num_tokens": 400315811.0, "step": 10489 }, { "epoch": 1.334435822414451, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.883544921875, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8407248258590698, "num_tokens": 400356321.0, "step": 10490 }, { "epoch": 1.3345630326930416, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.61100769042969, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8590408563613892, "num_tokens": 400392382.0, "step": 10491 }, { "epoch": 1.334690242971632, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.96333312988281, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8672813177108765, "num_tokens": 400434947.0, "step": 10492 }, { "epoch": 1.3348174532502226, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.79202651977539, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8417603969573975, "num_tokens": 400472742.0, "step": 10493 }, { "epoch": 1.3349446635288131, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.749237060546875, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8733494281768799, "num_tokens": 400510029.0, "step": 10494 }, { "epoch": 1.3350718738074037, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.76091384887695, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8567475080490112, "num_tokens": 400547383.0, "step": 10495 }, { "epoch": 1.3351990840859942, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.994163513183594, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8709878325462341, "num_tokens": 400585242.0, "step": 10496 }, { "epoch": 1.3353262943645847, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.1526985168457, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8712687492370605, "num_tokens": 400623117.0, "step": 10497 }, { "epoch": 1.3354535046431752, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.95193862915039, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8498178720474243, "num_tokens": 400660170.0, "step": 10498 }, { "epoch": 1.3355807149217656, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.38575744628906, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8663410544395447, "num_tokens": 400700467.0, "step": 10499 }, { "epoch": 1.335707925200356, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.5965461730957, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8534479141235352, "num_tokens": 400740269.0, "step": 10500 }, { "epoch": 1.3358351354789466, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.23169708251953, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8706623315811157, "num_tokens": 400776465.0, "step": 10501 }, { "epoch": 1.3359623457575371, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.02824401855469, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8556185960769653, "num_tokens": 400811210.0, "step": 10502 }, { "epoch": 1.3360895560361277, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.09515380859375, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8537570238113403, "num_tokens": 400848820.0, "step": 10503 }, { "epoch": 1.3362167663147182, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.73918533325195, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8619087934494019, "num_tokens": 400888774.0, "step": 10504 }, { "epoch": 1.3363439765933087, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.617431640625, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8618754148483276, "num_tokens": 400935067.0, "step": 10505 }, { "epoch": 1.3364711868718993, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.58064651489258, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8698947429656982, "num_tokens": 400974122.0, "step": 10506 }, { "epoch": 1.3365983971504898, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.42829895019531, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8692470192909241, "num_tokens": 401008911.0, "step": 10507 }, { "epoch": 1.3367256074290803, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.653560638427734, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8655916452407837, "num_tokens": 401039453.0, "step": 10508 }, { "epoch": 1.3368528177076708, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.495792388916016, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8696331977844238, "num_tokens": 401075854.0, "step": 10509 }, { "epoch": 1.3369800279862614, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.38642120361328, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8598654270172119, "num_tokens": 401114922.0, "step": 10510 }, { "epoch": 1.337107238264852, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.43589401245117, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8684946894645691, "num_tokens": 401155833.0, "step": 10511 }, { "epoch": 1.3372344485434424, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.38540267944336, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8662194013595581, "num_tokens": 401196971.0, "step": 10512 }, { "epoch": 1.337361658822033, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.20746994018555, "learning_rate": 1e-06, "loss": 0.6483, "mean_token_accuracy": 0.8284526467323303, "num_tokens": 401237312.0, "step": 10513 }, { "epoch": 1.3374888691006233, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.13706970214844, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.857101559638977, "num_tokens": 401274630.0, "step": 10514 }, { "epoch": 1.3376160793792138, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.99287414550781, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8656586408615112, "num_tokens": 401311780.0, "step": 10515 }, { "epoch": 1.3377432896578043, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.110877990722656, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.86692875623703, "num_tokens": 401352614.0, "step": 10516 }, { "epoch": 1.3378704999363948, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.13694381713867, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8637064695358276, "num_tokens": 401394495.0, "step": 10517 }, { "epoch": 1.3379977102149854, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.814422607421875, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8460640907287598, "num_tokens": 401427758.0, "step": 10518 }, { "epoch": 1.338124920493576, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.38546371459961, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8450185060501099, "num_tokens": 401473868.0, "step": 10519 }, { "epoch": 1.3382521307721664, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.62504959106445, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8421204090118408, "num_tokens": 401510459.0, "step": 10520 }, { "epoch": 1.338379341050757, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.24909973144531, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.8310529589653015, "num_tokens": 401545283.0, "step": 10521 }, { "epoch": 1.3385065513293475, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.76774597167969, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8607081174850464, "num_tokens": 401577166.0, "step": 10522 }, { "epoch": 1.3386337616079378, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.147403717041016, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8722127676010132, "num_tokens": 401619075.0, "step": 10523 }, { "epoch": 1.3387609718865283, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.99876022338867, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8538039922714233, "num_tokens": 401661286.0, "step": 10524 }, { "epoch": 1.3388881821651188, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.10063934326172, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8506443500518799, "num_tokens": 401699672.0, "step": 10525 }, { "epoch": 1.3390153924437094, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.93810272216797, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8526769876480103, "num_tokens": 401732676.0, "step": 10526 }, { "epoch": 1.3391426027223, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.09407424926758, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.857346773147583, "num_tokens": 401771593.0, "step": 10527 }, { "epoch": 1.3392698130008904, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.87614440917969, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.844841718673706, "num_tokens": 401809605.0, "step": 10528 }, { "epoch": 1.339397023279481, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.17034149169922, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8519341945648193, "num_tokens": 401845349.0, "step": 10529 }, { "epoch": 1.3395242335580715, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.88047409057617, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8824265599250793, "num_tokens": 401883186.0, "step": 10530 }, { "epoch": 1.339651443836662, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.20493698120117, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8453284502029419, "num_tokens": 401921149.0, "step": 10531 }, { "epoch": 1.3397786541152525, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.759864807128906, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8533728718757629, "num_tokens": 401961813.0, "step": 10532 }, { "epoch": 1.339905864393843, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.17646789550781, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8419196605682373, "num_tokens": 402001212.0, "step": 10533 }, { "epoch": 1.3400330746724336, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.68476486206055, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8532111644744873, "num_tokens": 402044648.0, "step": 10534 }, { "epoch": 1.3401602849510241, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.166229248046875, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8611907958984375, "num_tokens": 402087391.0, "step": 10535 }, { "epoch": 1.3402874952296147, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.784236907958984, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8792351484298706, "num_tokens": 402119598.0, "step": 10536 }, { "epoch": 1.3404147055082052, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.267147064208984, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8337475657463074, "num_tokens": 402153628.0, "step": 10537 }, { "epoch": 1.3405419157867955, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.671905517578125, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8770228028297424, "num_tokens": 402190876.0, "step": 10538 }, { "epoch": 1.340669126065386, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.70660400390625, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8678790330886841, "num_tokens": 402227168.0, "step": 10539 }, { "epoch": 1.3407963363439765, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.07854080200195, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8683390617370605, "num_tokens": 402262731.0, "step": 10540 }, { "epoch": 1.340923546622567, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.713966369628906, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8619108200073242, "num_tokens": 402299992.0, "step": 10541 }, { "epoch": 1.3410507569011576, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.782203674316406, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8509297966957092, "num_tokens": 402339129.0, "step": 10542 }, { "epoch": 1.3411779671797481, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.60688781738281, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8414230942726135, "num_tokens": 402375380.0, "step": 10543 }, { "epoch": 1.3413051774583387, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.850807189941406, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8492981195449829, "num_tokens": 402412159.0, "step": 10544 }, { "epoch": 1.3414323877369292, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.66004943847656, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.855436384677887, "num_tokens": 402446676.0, "step": 10545 }, { "epoch": 1.3415595980155197, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.793373107910156, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8616732358932495, "num_tokens": 402490741.0, "step": 10546 }, { "epoch": 1.3416868082941102, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.80167007446289, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.868439793586731, "num_tokens": 402527398.0, "step": 10547 }, { "epoch": 1.3418140185727006, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.84475326538086, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8573809862136841, "num_tokens": 402565914.0, "step": 10548 }, { "epoch": 1.341941228851291, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.864871978759766, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8638292551040649, "num_tokens": 402600671.0, "step": 10549 }, { "epoch": 1.3420684391298816, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.07175827026367, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.872840166091919, "num_tokens": 402637381.0, "step": 10550 }, { "epoch": 1.3421956494084721, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.45370101928711, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8646074533462524, "num_tokens": 402671190.0, "step": 10551 }, { "epoch": 1.3423228596870627, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.8863525390625, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8607975840568542, "num_tokens": 402721518.0, "step": 10552 }, { "epoch": 1.3424500699656532, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.58598327636719, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8599542379379272, "num_tokens": 402758949.0, "step": 10553 }, { "epoch": 1.3425772802442437, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.833988189697266, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8539878726005554, "num_tokens": 402794342.0, "step": 10554 }, { "epoch": 1.3427044905228342, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.0280647277832, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8706756234169006, "num_tokens": 402831031.0, "step": 10555 }, { "epoch": 1.3428317008014248, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.13652801513672, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8712354302406311, "num_tokens": 402871486.0, "step": 10556 }, { "epoch": 1.3429589110800153, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.52830123901367, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8713493943214417, "num_tokens": 402916024.0, "step": 10557 }, { "epoch": 1.3430861213586058, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.86709213256836, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8421871662139893, "num_tokens": 402952116.0, "step": 10558 }, { "epoch": 1.3432133316371964, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.44178009033203, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8571949005126953, "num_tokens": 402998064.0, "step": 10559 }, { "epoch": 1.343340541915787, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.27717208862305, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8652619123458862, "num_tokens": 403041763.0, "step": 10560 }, { "epoch": 1.3434677521943774, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.26005172729492, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8718801736831665, "num_tokens": 403080956.0, "step": 10561 }, { "epoch": 1.343594962472968, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 38.93461227416992, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8701533675193787, "num_tokens": 403117200.0, "step": 10562 }, { "epoch": 1.3437221727515583, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.10536575317383, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8612216711044312, "num_tokens": 403154112.0, "step": 10563 }, { "epoch": 1.3438493830301488, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.333984375, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8568677306175232, "num_tokens": 403188265.0, "step": 10564 }, { "epoch": 1.3439765933087393, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.1331672668457, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.837236762046814, "num_tokens": 403231236.0, "step": 10565 }, { "epoch": 1.3441038035873298, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.26801300048828, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8618981838226318, "num_tokens": 403268642.0, "step": 10566 }, { "epoch": 1.3442310138659204, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.96254348754883, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8681774139404297, "num_tokens": 403307679.0, "step": 10567 }, { "epoch": 1.344358224144511, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.504268646240234, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8653906583786011, "num_tokens": 403346395.0, "step": 10568 }, { "epoch": 1.3444854344231014, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.39670181274414, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8675307631492615, "num_tokens": 403383001.0, "step": 10569 }, { "epoch": 1.344612644701692, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.220577239990234, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8505082130432129, "num_tokens": 403426090.0, "step": 10570 }, { "epoch": 1.3447398549802825, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.18070983886719, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8529919385910034, "num_tokens": 403470231.0, "step": 10571 }, { "epoch": 1.3448670652588728, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.45823669433594, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8663755655288696, "num_tokens": 403508677.0, "step": 10572 }, { "epoch": 1.3449942755374633, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.69340515136719, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8411707878112793, "num_tokens": 403545175.0, "step": 10573 }, { "epoch": 1.3451214858160538, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.099700927734375, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8742872476577759, "num_tokens": 403583321.0, "step": 10574 }, { "epoch": 1.3452486960946444, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.97467803955078, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8522027730941772, "num_tokens": 403618547.0, "step": 10575 }, { "epoch": 1.345375906373235, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 38.50345230102539, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8647480607032776, "num_tokens": 403657552.0, "step": 10576 }, { "epoch": 1.3455031166518254, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 40.0323600769043, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8623567223548889, "num_tokens": 403697021.0, "step": 10577 }, { "epoch": 1.345630326930416, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.262027740478516, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8499574661254883, "num_tokens": 403736480.0, "step": 10578 }, { "epoch": 1.3457575372090065, "ewc_loss": 0.130859375, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.631202697753906, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8683453798294067, "num_tokens": 403772056.0, "step": 10579 }, { "epoch": 1.345884747487597, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001049041748046875, "grad_norm": 38.864654541015625, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.854655385017395, "num_tokens": 403813646.0, "step": 10580 }, { "epoch": 1.3460119577661875, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.259498596191406, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8640326857566833, "num_tokens": 403853904.0, "step": 10581 }, { "epoch": 1.346139168044778, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.02359390258789, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8431007862091064, "num_tokens": 403898528.0, "step": 10582 }, { "epoch": 1.3462663783233686, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.24700164794922, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8461542129516602, "num_tokens": 403929518.0, "step": 10583 }, { "epoch": 1.3463935886019591, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.2219123840332, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.861986517906189, "num_tokens": 403970323.0, "step": 10584 }, { "epoch": 1.3465207988805497, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.00624084472656, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8651018738746643, "num_tokens": 404012499.0, "step": 10585 }, { "epoch": 1.3466480091591402, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.149444580078125, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8557374477386475, "num_tokens": 404055370.0, "step": 10586 }, { "epoch": 1.3467752194377305, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.96932601928711, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8622610569000244, "num_tokens": 404092498.0, "step": 10587 }, { "epoch": 1.346902429716321, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.475975036621094, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8653943538665771, "num_tokens": 404132466.0, "step": 10588 }, { "epoch": 1.3470296399949115, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.893157958984375, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.860224187374115, "num_tokens": 404171913.0, "step": 10589 }, { "epoch": 1.347156850273502, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.745513916015625, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8735940456390381, "num_tokens": 404205817.0, "step": 10590 }, { "epoch": 1.3472840605520926, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.0060920715332, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8631651401519775, "num_tokens": 404243922.0, "step": 10591 }, { "epoch": 1.3474112708306831, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.52598571777344, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8636541366577148, "num_tokens": 404283392.0, "step": 10592 }, { "epoch": 1.3475384811092737, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.982093811035156, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.8346840143203735, "num_tokens": 404323044.0, "step": 10593 }, { "epoch": 1.3476656913878642, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.503719329833984, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8653370141983032, "num_tokens": 404359761.0, "step": 10594 }, { "epoch": 1.3477929016664547, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.86669158935547, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8460913896560669, "num_tokens": 404395354.0, "step": 10595 }, { "epoch": 1.3479201119450452, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.54906463623047, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8430063724517822, "num_tokens": 404432052.0, "step": 10596 }, { "epoch": 1.3480473222236355, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.17186737060547, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8531309366226196, "num_tokens": 404472689.0, "step": 10597 }, { "epoch": 1.348174532502226, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.375492095947266, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8442662954330444, "num_tokens": 404515916.0, "step": 10598 }, { "epoch": 1.3483017427808166, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.00393295288086, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8583369851112366, "num_tokens": 404557399.0, "step": 10599 }, { "epoch": 1.3484289530594071, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.72658157348633, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8561927676200867, "num_tokens": 404592573.0, "step": 10600 }, { "epoch": 1.3485561633379977, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.79104232788086, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8563780784606934, "num_tokens": 404635266.0, "step": 10601 }, { "epoch": 1.3486833736165882, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.970977783203125, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8686215281486511, "num_tokens": 404673004.0, "step": 10602 }, { "epoch": 1.3488105838951787, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.631961822509766, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8446532487869263, "num_tokens": 404705153.0, "step": 10603 }, { "epoch": 1.3489377941737692, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.57439041137695, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8507019281387329, "num_tokens": 404737068.0, "step": 10604 }, { "epoch": 1.3490650044523598, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.529258728027344, "learning_rate": 1e-06, "loss": 0.6196, "mean_token_accuracy": 0.8394129276275635, "num_tokens": 404773680.0, "step": 10605 }, { "epoch": 1.3491922147309503, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.99413299560547, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8646550178527832, "num_tokens": 404809937.0, "step": 10606 }, { "epoch": 1.3493194250095408, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.1730842590332, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8509000539779663, "num_tokens": 404847146.0, "step": 10607 }, { "epoch": 1.3494466352881314, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.11764144897461, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8607746362686157, "num_tokens": 404882952.0, "step": 10608 }, { "epoch": 1.3495738455667219, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.0844612121582, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8695980310440063, "num_tokens": 404925347.0, "step": 10609 }, { "epoch": 1.3497010558453124, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.19789505004883, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8446454405784607, "num_tokens": 404963411.0, "step": 10610 }, { "epoch": 1.349828266123903, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.30533218383789, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8577845096588135, "num_tokens": 404997522.0, "step": 10611 }, { "epoch": 1.3499554764024932, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.205074310302734, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8573217391967773, "num_tokens": 405034049.0, "step": 10612 }, { "epoch": 1.3500826866810838, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.32841110229492, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8560522794723511, "num_tokens": 405074985.0, "step": 10613 }, { "epoch": 1.3502098969596743, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.71890640258789, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8748048543930054, "num_tokens": 405116133.0, "step": 10614 }, { "epoch": 1.3503371072382648, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.56459426879883, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.858600378036499, "num_tokens": 405155572.0, "step": 10615 }, { "epoch": 1.3504643175168554, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.42795944213867, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8631633520126343, "num_tokens": 405194700.0, "step": 10616 }, { "epoch": 1.350591527795446, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.20259475708008, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8548098802566528, "num_tokens": 405233582.0, "step": 10617 }, { "epoch": 1.3507187380740364, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.34271240234375, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8708990216255188, "num_tokens": 405274987.0, "step": 10618 }, { "epoch": 1.350845948352627, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.531028747558594, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8561691045761108, "num_tokens": 405308151.0, "step": 10619 }, { "epoch": 1.3509731586312175, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.900882720947266, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8582868576049805, "num_tokens": 405343939.0, "step": 10620 }, { "epoch": 1.3511003689098078, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.3444938659668, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8585363030433655, "num_tokens": 405384211.0, "step": 10621 }, { "epoch": 1.3512275791883983, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.230560302734375, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8659242391586304, "num_tokens": 405419971.0, "step": 10622 }, { "epoch": 1.3513547894669888, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.35420227050781, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8516907691955566, "num_tokens": 405451322.0, "step": 10623 }, { "epoch": 1.3514819997455794, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.346378326416016, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8514859676361084, "num_tokens": 405496101.0, "step": 10624 }, { "epoch": 1.35160921002417, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.810726165771484, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8434723615646362, "num_tokens": 405530034.0, "step": 10625 }, { "epoch": 1.3517364203027604, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.34720993041992, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8709453344345093, "num_tokens": 405570893.0, "step": 10626 }, { "epoch": 1.351863630581351, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.278892517089844, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8462573885917664, "num_tokens": 405610120.0, "step": 10627 }, { "epoch": 1.3519908408599415, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.364681243896484, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8566445112228394, "num_tokens": 405651878.0, "step": 10628 }, { "epoch": 1.352118051138532, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.18159103393555, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8651894330978394, "num_tokens": 405694578.0, "step": 10629 }, { "epoch": 1.3522452614171225, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.26424026489258, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.865505576133728, "num_tokens": 405734128.0, "step": 10630 }, { "epoch": 1.352372471695713, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.263702392578125, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8649027347564697, "num_tokens": 405776588.0, "step": 10631 }, { "epoch": 1.3524996819743036, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.565818786621094, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8558341264724731, "num_tokens": 405810867.0, "step": 10632 }, { "epoch": 1.3526268922528941, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.13385772705078, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8774200677871704, "num_tokens": 405848884.0, "step": 10633 }, { "epoch": 1.3527541025314846, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.507240295410156, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.87907475233078, "num_tokens": 405888759.0, "step": 10634 }, { "epoch": 1.3528813128100752, "ewc_loss": 0.125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 40.34764862060547, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8724900484085083, "num_tokens": 405925686.0, "step": 10635 }, { "epoch": 1.3530085230886655, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 39.456504821777344, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8720318078994751, "num_tokens": 405960836.0, "step": 10636 }, { "epoch": 1.353135733367256, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.44148254394531, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8712465763092041, "num_tokens": 406000314.0, "step": 10637 }, { "epoch": 1.3532629436458465, "ewc_loss": 0.125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010538101196289062, "grad_norm": 39.034095764160156, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8688659071922302, "num_tokens": 406039468.0, "step": 10638 }, { "epoch": 1.353390153924437, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.62478256225586, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.883298397064209, "num_tokens": 406076828.0, "step": 10639 }, { "epoch": 1.3535173642030276, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.33932876586914, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8593964576721191, "num_tokens": 406116504.0, "step": 10640 }, { "epoch": 1.3536445744816181, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.3718147277832, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8586039543151855, "num_tokens": 406155184.0, "step": 10641 }, { "epoch": 1.3537717847602087, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.483489990234375, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.8410919308662415, "num_tokens": 406195411.0, "step": 10642 }, { "epoch": 1.3538989950387992, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.93049240112305, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8505657911300659, "num_tokens": 406230159.0, "step": 10643 }, { "epoch": 1.3540262053173897, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.23793411254883, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.848642110824585, "num_tokens": 406264541.0, "step": 10644 }, { "epoch": 1.3541534155959802, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.33938217163086, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.866851806640625, "num_tokens": 406301869.0, "step": 10645 }, { "epoch": 1.3542806258745705, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.15889358520508, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8623210787773132, "num_tokens": 406338819.0, "step": 10646 }, { "epoch": 1.354407836153161, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.547027587890625, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8610140085220337, "num_tokens": 406375914.0, "step": 10647 }, { "epoch": 1.3545350464317516, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.48519515991211, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8658533692359924, "num_tokens": 406418543.0, "step": 10648 }, { "epoch": 1.3546622567103421, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.89645004272461, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8572551012039185, "num_tokens": 406455731.0, "step": 10649 }, { "epoch": 1.3547894669889327, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.85574722290039, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8598106503486633, "num_tokens": 406496266.0, "step": 10650 }, { "epoch": 1.3549166772675232, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.23444366455078, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8549928069114685, "num_tokens": 406541676.0, "step": 10651 }, { "epoch": 1.3550438875461137, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.5887336730957, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.855477511882782, "num_tokens": 406578997.0, "step": 10652 }, { "epoch": 1.3551710978247042, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.399864196777344, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8654111623764038, "num_tokens": 406619404.0, "step": 10653 }, { "epoch": 1.3552983081032948, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.59928894042969, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8788301944732666, "num_tokens": 406656092.0, "step": 10654 }, { "epoch": 1.3554255183818853, "ewc_loss": 0.1259765625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.97493362426758, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8438533544540405, "num_tokens": 406701600.0, "step": 10655 }, { "epoch": 1.3555527286604758, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.6799430847168, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8594629168510437, "num_tokens": 406739649.0, "step": 10656 }, { "epoch": 1.3556799389390664, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.09185791015625, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8783971667289734, "num_tokens": 406774962.0, "step": 10657 }, { "epoch": 1.3558071492176569, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.658931732177734, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8553857207298279, "num_tokens": 406818047.0, "step": 10658 }, { "epoch": 1.3559343594962474, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.157127380371094, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.852582573890686, "num_tokens": 406859464.0, "step": 10659 }, { "epoch": 1.356061569774838, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.30677795410156, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8582407832145691, "num_tokens": 406896444.0, "step": 10660 }, { "epoch": 1.3561887800534282, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.35485076904297, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8535211682319641, "num_tokens": 406934623.0, "step": 10661 }, { "epoch": 1.3563159903320188, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.75, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8536587357521057, "num_tokens": 406974290.0, "step": 10662 }, { "epoch": 1.3564432006106093, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.65694046020508, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8783268332481384, "num_tokens": 407011279.0, "step": 10663 }, { "epoch": 1.3565704108891998, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.82783508300781, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8487513065338135, "num_tokens": 407054733.0, "step": 10664 }, { "epoch": 1.3566976211677904, "ewc_loss": 0.1298828125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.6012077331543, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8611756563186646, "num_tokens": 407098410.0, "step": 10665 }, { "epoch": 1.3568248314463809, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.62901306152344, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8550577163696289, "num_tokens": 407134231.0, "step": 10666 }, { "epoch": 1.3569520417249714, "ewc_loss": 0.130859375, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.590885162353516, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8449733257293701, "num_tokens": 407167231.0, "step": 10667 }, { "epoch": 1.357079252003562, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.972007751464844, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8610502481460571, "num_tokens": 407206604.0, "step": 10668 }, { "epoch": 1.3572064622821525, "ewc_loss": 0.130859375, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.84453201293945, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8613702058792114, "num_tokens": 407248247.0, "step": 10669 }, { "epoch": 1.3573336725607428, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.2690315246582, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8699659109115601, "num_tokens": 407282870.0, "step": 10670 }, { "epoch": 1.3574608828393333, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.55401611328125, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.865394115447998, "num_tokens": 407318704.0, "step": 10671 }, { "epoch": 1.3575880931179238, "ewc_loss": 0.126953125, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.19156265258789, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8618727922439575, "num_tokens": 407360003.0, "step": 10672 }, { "epoch": 1.3577153033965144, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.5931510925293, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8531867265701294, "num_tokens": 407399473.0, "step": 10673 }, { "epoch": 1.357842513675105, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.33753204345703, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8289167881011963, "num_tokens": 407440576.0, "step": 10674 }, { "epoch": 1.3579697239536954, "ewc_loss": 0.12890625, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.72150421142578, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8556351661682129, "num_tokens": 407481057.0, "step": 10675 }, { "epoch": 1.358096934232286, "ewc_loss": 0.1279296875, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.22648620605469, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8716191649436951, "num_tokens": 407517104.0, "step": 10676 }, { "epoch": 1.3582241445108765, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.30216598510742, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.865044355392456, "num_tokens": 407552604.0, "step": 10677 }, { "epoch": 1.358351354789467, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.43107604980469, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8746585845947266, "num_tokens": 407588282.0, "step": 10678 }, { "epoch": 1.3584785650680575, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.53215789794922, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8552199006080627, "num_tokens": 407619837.0, "step": 10679 }, { "epoch": 1.358605775346648, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.419349670410156, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8564875721931458, "num_tokens": 407656939.0, "step": 10680 }, { "epoch": 1.3587329856252386, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.177616119384766, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.850109875202179, "num_tokens": 407702756.0, "step": 10681 }, { "epoch": 1.3588601959038291, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.64920425415039, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8677487969398499, "num_tokens": 407736142.0, "step": 10682 }, { "epoch": 1.3589874061824196, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.05601501464844, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8569602370262146, "num_tokens": 407773184.0, "step": 10683 }, { "epoch": 1.3591146164610102, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.99945068359375, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8431553244590759, "num_tokens": 407806543.0, "step": 10684 }, { "epoch": 1.3592418267396005, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.33740234375, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8579525351524353, "num_tokens": 407841371.0, "step": 10685 }, { "epoch": 1.359369037018191, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.76473617553711, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8644161224365234, "num_tokens": 407879862.0, "step": 10686 }, { "epoch": 1.3594962472967815, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.93952178955078, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8536425828933716, "num_tokens": 407917947.0, "step": 10687 }, { "epoch": 1.359623457575372, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.76352310180664, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8667677640914917, "num_tokens": 407953987.0, "step": 10688 }, { "epoch": 1.3597506678539626, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.95977783203125, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8429237604141235, "num_tokens": 407993166.0, "step": 10689 }, { "epoch": 1.3598778781325531, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.07845687866211, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8689039945602417, "num_tokens": 408037063.0, "step": 10690 }, { "epoch": 1.3600050884111436, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 40.3095703125, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8617345690727234, "num_tokens": 408078526.0, "step": 10691 }, { "epoch": 1.3601322986897342, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.8215446472168, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8679459095001221, "num_tokens": 408116002.0, "step": 10692 }, { "epoch": 1.3602595089683247, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.30353927612305, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8554404377937317, "num_tokens": 408155885.0, "step": 10693 }, { "epoch": 1.3603867192469152, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 38.79819869995117, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8649832606315613, "num_tokens": 408190085.0, "step": 10694 }, { "epoch": 1.3605139295255055, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.55671691894531, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8611516952514648, "num_tokens": 408227289.0, "step": 10695 }, { "epoch": 1.360641139804096, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.00111770629883, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8669286370277405, "num_tokens": 408271853.0, "step": 10696 }, { "epoch": 1.3607683500826866, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.45995330810547, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8734104037284851, "num_tokens": 408311133.0, "step": 10697 }, { "epoch": 1.3608955603612771, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 38.99623107910156, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8608765006065369, "num_tokens": 408342775.0, "step": 10698 }, { "epoch": 1.3610227706398677, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.25508499145508, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8739203214645386, "num_tokens": 408379063.0, "step": 10699 }, { "epoch": 1.3611499809184582, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010633468627929688, "grad_norm": 39.05087661743164, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8461544513702393, "num_tokens": 408415856.0, "step": 10700 }, { "epoch": 1.3612771911970487, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.120052337646484, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.86480712890625, "num_tokens": 408451522.0, "step": 10701 }, { "epoch": 1.3614044014756392, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.21660232543945, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8658470511436462, "num_tokens": 408494605.0, "step": 10702 }, { "epoch": 1.3615316117542298, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.94873046875, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8591081500053406, "num_tokens": 408536373.0, "step": 10703 }, { "epoch": 1.3616588220328203, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.046875, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8478068113327026, "num_tokens": 408574724.0, "step": 10704 }, { "epoch": 1.3617860323114108, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.884742736816406, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8517606258392334, "num_tokens": 408615057.0, "step": 10705 }, { "epoch": 1.3619132425900013, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.329078674316406, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8481870293617249, "num_tokens": 408651324.0, "step": 10706 }, { "epoch": 1.3620404528685919, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.592750549316406, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8656449317932129, "num_tokens": 408690533.0, "step": 10707 }, { "epoch": 1.3621676631471824, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.4869384765625, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8519177436828613, "num_tokens": 408725970.0, "step": 10708 }, { "epoch": 1.362294873425773, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.23204040527344, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8684051036834717, "num_tokens": 408766550.0, "step": 10709 }, { "epoch": 1.3624220837043632, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.53558349609375, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8629979491233826, "num_tokens": 408804320.0, "step": 10710 }, { "epoch": 1.3625492939829538, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.53142166137695, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8630908131599426, "num_tokens": 408840902.0, "step": 10711 }, { "epoch": 1.3626765042615443, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.45600509643555, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8588857650756836, "num_tokens": 408879346.0, "step": 10712 }, { "epoch": 1.3628037145401348, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.247684478759766, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8729761838912964, "num_tokens": 408919336.0, "step": 10713 }, { "epoch": 1.3629309248187254, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.19485855102539, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8551961183547974, "num_tokens": 408961547.0, "step": 10714 }, { "epoch": 1.3630581350973159, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.512115478515625, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8744405508041382, "num_tokens": 409003573.0, "step": 10715 }, { "epoch": 1.3631853453759064, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.21943283081055, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.853125810623169, "num_tokens": 409045560.0, "step": 10716 }, { "epoch": 1.363312555654497, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.23350143432617, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8640601634979248, "num_tokens": 409087245.0, "step": 10717 }, { "epoch": 1.3634397659330875, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.76896286010742, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8698287010192871, "num_tokens": 409134177.0, "step": 10718 }, { "epoch": 1.3635669762116778, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.33987045288086, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8463139533996582, "num_tokens": 409169495.0, "step": 10719 }, { "epoch": 1.3636941864902683, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.54747009277344, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8368633985519409, "num_tokens": 409207781.0, "step": 10720 }, { "epoch": 1.3638213967688588, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.61318588256836, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.860307514667511, "num_tokens": 409245411.0, "step": 10721 }, { "epoch": 1.3639486070474494, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.24921798706055, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8683141469955444, "num_tokens": 409281757.0, "step": 10722 }, { "epoch": 1.3640758173260399, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.81976318359375, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8487335443496704, "num_tokens": 409321317.0, "step": 10723 }, { "epoch": 1.3642030276046304, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.91956329345703, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8685113191604614, "num_tokens": 409358919.0, "step": 10724 }, { "epoch": 1.364330237883221, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.71003341674805, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8522599935531616, "num_tokens": 409394355.0, "step": 10725 }, { "epoch": 1.3644574481618115, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 38.860660552978516, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8706721067428589, "num_tokens": 409433781.0, "step": 10726 }, { "epoch": 1.364584658440402, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.81333923339844, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8509925007820129, "num_tokens": 409471728.0, "step": 10727 }, { "epoch": 1.3647118687189925, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.93224334716797, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8779234886169434, "num_tokens": 409502294.0, "step": 10728 }, { "epoch": 1.364839078997583, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.931453704833984, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8609384298324585, "num_tokens": 409545878.0, "step": 10729 }, { "epoch": 1.3649662892761736, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.130836486816406, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8616793751716614, "num_tokens": 409584753.0, "step": 10730 }, { "epoch": 1.3650934995547641, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.37154769897461, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.865469217300415, "num_tokens": 409613725.0, "step": 10731 }, { "epoch": 1.3652207098333546, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.28617858886719, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8528109192848206, "num_tokens": 409652443.0, "step": 10732 }, { "epoch": 1.3653479201119452, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.24187469482422, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.867141604423523, "num_tokens": 409690683.0, "step": 10733 }, { "epoch": 1.3654751303905355, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.52742004394531, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8600125312805176, "num_tokens": 409727502.0, "step": 10734 }, { "epoch": 1.365602340669126, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.42885208129883, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8559539318084717, "num_tokens": 409764592.0, "step": 10735 }, { "epoch": 1.3657295509477165, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.68023681640625, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.83917635679245, "num_tokens": 409809645.0, "step": 10736 }, { "epoch": 1.365856761226307, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.538936614990234, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8581171631813049, "num_tokens": 409848048.0, "step": 10737 }, { "epoch": 1.3659839715048976, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.215675354003906, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8541767597198486, "num_tokens": 409884176.0, "step": 10738 }, { "epoch": 1.3661111817834881, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.93423080444336, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8618462085723877, "num_tokens": 409922630.0, "step": 10739 }, { "epoch": 1.3662383920620786, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.08924865722656, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8664451241493225, "num_tokens": 409961876.0, "step": 10740 }, { "epoch": 1.3663656023406692, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.87285232543945, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8686628341674805, "num_tokens": 409995432.0, "step": 10741 }, { "epoch": 1.3664928126192597, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 38.67782211303711, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8585267066955566, "num_tokens": 410034876.0, "step": 10742 }, { "epoch": 1.3666200228978502, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.5022087097168, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8624637722969055, "num_tokens": 410079190.0, "step": 10743 }, { "epoch": 1.3667472331764405, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 38.965599060058594, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8642337322235107, "num_tokens": 410112475.0, "step": 10744 }, { "epoch": 1.366874443455031, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.056514739990234, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8758370280265808, "num_tokens": 410154583.0, "step": 10745 }, { "epoch": 1.3670016537336216, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.134761810302734, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8608975410461426, "num_tokens": 410191631.0, "step": 10746 }, { "epoch": 1.3671288640122121, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.11089324951172, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8368320465087891, "num_tokens": 410232435.0, "step": 10747 }, { "epoch": 1.3672560742908026, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.39325714111328, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8649531602859497, "num_tokens": 410272940.0, "step": 10748 }, { "epoch": 1.3673832845693932, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.75140380859375, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8561591506004333, "num_tokens": 410312807.0, "step": 10749 }, { "epoch": 1.3675104948479837, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.68053436279297, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8665325045585632, "num_tokens": 410352433.0, "step": 10750 }, { "epoch": 1.3676377051265742, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.394718170166016, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8601521253585815, "num_tokens": 410394843.0, "step": 10751 }, { "epoch": 1.3677649154051648, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 40.06294631958008, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8623015880584717, "num_tokens": 410439353.0, "step": 10752 }, { "epoch": 1.3678921256837553, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.25577163696289, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8380519151687622, "num_tokens": 410485789.0, "step": 10753 }, { "epoch": 1.3680193359623458, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.149024963378906, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8433548212051392, "num_tokens": 410523385.0, "step": 10754 }, { "epoch": 1.3681465462409363, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.90373229980469, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8539847135543823, "num_tokens": 410553340.0, "step": 10755 }, { "epoch": 1.3682737565195269, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.31479263305664, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8685809969902039, "num_tokens": 410590041.0, "step": 10756 }, { "epoch": 1.3684009667981174, "ewc_loss": 0.1259765625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010585784912109375, "grad_norm": 39.077491760253906, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8603919744491577, "num_tokens": 410627703.0, "step": 10757 }, { "epoch": 1.368528177076708, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.80036544799805, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8521738648414612, "num_tokens": 410662555.0, "step": 10758 }, { "epoch": 1.3686553873552982, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.46281814575195, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8432990908622742, "num_tokens": 410706026.0, "step": 10759 }, { "epoch": 1.3687825976338888, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.63463592529297, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8569204211235046, "num_tokens": 410743275.0, "step": 10760 }, { "epoch": 1.3689098079124793, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.573394775390625, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8688872456550598, "num_tokens": 410781220.0, "step": 10761 }, { "epoch": 1.3690370181910698, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.89061737060547, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8684613108634949, "num_tokens": 410820014.0, "step": 10762 }, { "epoch": 1.3691642284696603, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.57761764526367, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8540841341018677, "num_tokens": 410858721.0, "step": 10763 }, { "epoch": 1.3692914387482509, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.5892219543457, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8645783066749573, "num_tokens": 410898527.0, "step": 10764 }, { "epoch": 1.3694186490268414, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 40.26872634887695, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8611176609992981, "num_tokens": 410938426.0, "step": 10765 }, { "epoch": 1.369545859305432, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.312110900878906, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8655705451965332, "num_tokens": 410969940.0, "step": 10766 }, { "epoch": 1.3696730695840225, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.0106086730957, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8452728986740112, "num_tokens": 411008859.0, "step": 10767 }, { "epoch": 1.3698002798626128, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.60553741455078, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8742154836654663, "num_tokens": 411043516.0, "step": 10768 }, { "epoch": 1.3699274901412033, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.518985748291016, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8531030416488647, "num_tokens": 411079092.0, "step": 10769 }, { "epoch": 1.3700547004197938, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.68241882324219, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8641864061355591, "num_tokens": 411117360.0, "step": 10770 }, { "epoch": 1.3701819106983844, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.53839111328125, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8496668934822083, "num_tokens": 411153615.0, "step": 10771 }, { "epoch": 1.3703091209769749, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.28298568725586, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8617480993270874, "num_tokens": 411193571.0, "step": 10772 }, { "epoch": 1.3704363312555654, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.454994201660156, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8403101563453674, "num_tokens": 411230427.0, "step": 10773 }, { "epoch": 1.370563541534156, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.793453216552734, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8571656346321106, "num_tokens": 411271251.0, "step": 10774 }, { "epoch": 1.3706907518127465, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.17195129394531, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8619130253791809, "num_tokens": 411304906.0, "step": 10775 }, { "epoch": 1.370817962091337, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.4952507019043, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8774746060371399, "num_tokens": 411341417.0, "step": 10776 }, { "epoch": 1.3709451723699275, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.7366943359375, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.862004280090332, "num_tokens": 411382118.0, "step": 10777 }, { "epoch": 1.371072382648518, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.066162109375, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8494499921798706, "num_tokens": 411426702.0, "step": 10778 }, { "epoch": 1.3711995929271086, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.975502014160156, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8577426075935364, "num_tokens": 411461689.0, "step": 10779 }, { "epoch": 1.371326803205699, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 38.99892807006836, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8613393306732178, "num_tokens": 411499613.0, "step": 10780 }, { "epoch": 1.3714540134842896, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.81037139892578, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8584319353103638, "num_tokens": 411538422.0, "step": 10781 }, { "epoch": 1.3715812237628802, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.10560607910156, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8726863265037537, "num_tokens": 411572401.0, "step": 10782 }, { "epoch": 1.3717084340414705, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.404056549072266, "learning_rate": 1e-06, "loss": 0.5217, "mean_token_accuracy": 0.8686317205429077, "num_tokens": 411611268.0, "step": 10783 }, { "epoch": 1.371835644320061, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.6185188293457, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8580589294433594, "num_tokens": 411648837.0, "step": 10784 }, { "epoch": 1.3719628545986515, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.563446044921875, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8629060983657837, "num_tokens": 411690327.0, "step": 10785 }, { "epoch": 1.372090064877242, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.80820083618164, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8588378429412842, "num_tokens": 411729665.0, "step": 10786 }, { "epoch": 1.3722172751558326, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.5306510925293, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8747509717941284, "num_tokens": 411764716.0, "step": 10787 }, { "epoch": 1.3723444854344231, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.31027603149414, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.862206220626831, "num_tokens": 411805560.0, "step": 10788 }, { "epoch": 1.3724716957130136, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.68429183959961, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8643958568572998, "num_tokens": 411841151.0, "step": 10789 }, { "epoch": 1.3725989059916042, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.34601974487305, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8652912378311157, "num_tokens": 411875088.0, "step": 10790 }, { "epoch": 1.3727261162701947, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.37128448486328, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8599961996078491, "num_tokens": 411914554.0, "step": 10791 }, { "epoch": 1.3728533265487852, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.655418395996094, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8614143133163452, "num_tokens": 411952237.0, "step": 10792 }, { "epoch": 1.3729805368273755, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.53013229370117, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8547527194023132, "num_tokens": 411989676.0, "step": 10793 }, { "epoch": 1.373107747105966, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.79130935668945, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8732666969299316, "num_tokens": 412025804.0, "step": 10794 }, { "epoch": 1.3732349573845566, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.447479248046875, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8749058246612549, "num_tokens": 412060085.0, "step": 10795 }, { "epoch": 1.3733621676631471, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.795352935791016, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8666133880615234, "num_tokens": 412100750.0, "step": 10796 }, { "epoch": 1.3734893779417376, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.69932174682617, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8630338311195374, "num_tokens": 412145829.0, "step": 10797 }, { "epoch": 1.3736165882203282, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.566837310791016, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8660529851913452, "num_tokens": 412187607.0, "step": 10798 }, { "epoch": 1.3737437984989187, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.896366119384766, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8468641638755798, "num_tokens": 412227214.0, "step": 10799 }, { "epoch": 1.3738710087775092, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.982913970947266, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8624692559242249, "num_tokens": 412261929.0, "step": 10800 }, { "epoch": 1.3739982190560998, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.530967712402344, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8544453382492065, "num_tokens": 412299441.0, "step": 10801 }, { "epoch": 1.3741254293346903, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.612178802490234, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.859194278717041, "num_tokens": 412339248.0, "step": 10802 }, { "epoch": 1.3742526396132808, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.71348190307617, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8590515851974487, "num_tokens": 412377224.0, "step": 10803 }, { "epoch": 1.3743798498918713, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.415855407714844, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8380517959594727, "num_tokens": 412415331.0, "step": 10804 }, { "epoch": 1.3745070601704619, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.36064910888672, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8593662977218628, "num_tokens": 412455604.0, "step": 10805 }, { "epoch": 1.3746342704490524, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.70291519165039, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8680887222290039, "num_tokens": 412489295.0, "step": 10806 }, { "epoch": 1.374761480727643, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.68560791015625, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8569507598876953, "num_tokens": 412525750.0, "step": 10807 }, { "epoch": 1.3748886910062332, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.12266159057617, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.86966872215271, "num_tokens": 412567163.0, "step": 10808 }, { "epoch": 1.3750159012848238, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.110172271728516, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8677688837051392, "num_tokens": 412598789.0, "step": 10809 }, { "epoch": 1.3751431115634143, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.28060531616211, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8631178140640259, "num_tokens": 412632588.0, "step": 10810 }, { "epoch": 1.3752703218420048, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.95555114746094, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8589175939559937, "num_tokens": 412676369.0, "step": 10811 }, { "epoch": 1.3753975321205953, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.225730895996094, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8746887445449829, "num_tokens": 412712674.0, "step": 10812 }, { "epoch": 1.3755247423991859, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.05342483520508, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8548167943954468, "num_tokens": 412753531.0, "step": 10813 }, { "epoch": 1.3756519526777764, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.42544174194336, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8576775193214417, "num_tokens": 412799450.0, "step": 10814 }, { "epoch": 1.375779162956367, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.16357421875, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8833063244819641, "num_tokens": 412836858.0, "step": 10815 }, { "epoch": 1.3759063732349575, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.742488861083984, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8630892038345337, "num_tokens": 412871169.0, "step": 10816 }, { "epoch": 1.3760335835135478, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.39690017700195, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.866609513759613, "num_tokens": 412909281.0, "step": 10817 }, { "epoch": 1.3761607937921383, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.693145751953125, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8601813912391663, "num_tokens": 412950126.0, "step": 10818 }, { "epoch": 1.3762880040707288, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.71564865112305, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8491495251655579, "num_tokens": 412988644.0, "step": 10819 }, { "epoch": 1.3764152143493193, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.573238372802734, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8453432321548462, "num_tokens": 413028403.0, "step": 10820 }, { "epoch": 1.3765424246279099, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.93812561035156, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8518865704536438, "num_tokens": 413067429.0, "step": 10821 }, { "epoch": 1.3766696349065004, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.44308853149414, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8570165634155273, "num_tokens": 413101630.0, "step": 10822 }, { "epoch": 1.376796845185091, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.889930725097656, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8545684814453125, "num_tokens": 413147757.0, "step": 10823 }, { "epoch": 1.3769240554636815, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 40.01598358154297, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.862927258014679, "num_tokens": 413186803.0, "step": 10824 }, { "epoch": 1.377051265742272, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.10710144042969, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8548308610916138, "num_tokens": 413227201.0, "step": 10825 }, { "epoch": 1.3771784760208625, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.271095275878906, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8585925698280334, "num_tokens": 413270594.0, "step": 10826 }, { "epoch": 1.377305686299453, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.333221435546875, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8420692682266235, "num_tokens": 413307345.0, "step": 10827 }, { "epoch": 1.3774328965780436, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.92844009399414, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8644154071807861, "num_tokens": 413344310.0, "step": 10828 }, { "epoch": 1.377560106856634, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.323585510253906, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8782386779785156, "num_tokens": 413382047.0, "step": 10829 }, { "epoch": 1.3776873171352246, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.808067321777344, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8584774732589722, "num_tokens": 413424658.0, "step": 10830 }, { "epoch": 1.3778145274138152, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.357059478759766, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8694899082183838, "num_tokens": 413458386.0, "step": 10831 }, { "epoch": 1.3779417376924055, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.16226577758789, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8468607664108276, "num_tokens": 413492530.0, "step": 10832 }, { "epoch": 1.378068947970996, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.032875061035156, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8706096410751343, "num_tokens": 413528894.0, "step": 10833 }, { "epoch": 1.3781961582495865, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.50537872314453, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8560128211975098, "num_tokens": 413564765.0, "step": 10834 }, { "epoch": 1.378323368528177, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 38.98845672607422, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8333388566970825, "num_tokens": 413598122.0, "step": 10835 }, { "epoch": 1.3784505788067676, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.587223052978516, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8444297313690186, "num_tokens": 413634519.0, "step": 10836 }, { "epoch": 1.378577789085358, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001068115234375, "grad_norm": 39.34138488769531, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8527992367744446, "num_tokens": 413674021.0, "step": 10837 }, { "epoch": 1.3787049993639486, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.21310043334961, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.86607825756073, "num_tokens": 413713804.0, "step": 10838 }, { "epoch": 1.3788322096425392, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.743125915527344, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8680181503295898, "num_tokens": 413755665.0, "step": 10839 }, { "epoch": 1.3789594199211297, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.061683654785156, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.858466625213623, "num_tokens": 413790872.0, "step": 10840 }, { "epoch": 1.3790866301997202, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 40.060768127441406, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8794127702713013, "num_tokens": 413830798.0, "step": 10841 }, { "epoch": 1.3792138404783105, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.73600769042969, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8648133277893066, "num_tokens": 413865960.0, "step": 10842 }, { "epoch": 1.379341050756901, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.18079376220703, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8697350025177002, "num_tokens": 413898222.0, "step": 10843 }, { "epoch": 1.3794682610354916, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.64374923706055, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8546143770217896, "num_tokens": 413931839.0, "step": 10844 }, { "epoch": 1.3795954713140821, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.1536865234375, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8513311743736267, "num_tokens": 413963815.0, "step": 10845 }, { "epoch": 1.3797226815926726, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.40238571166992, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8682335615158081, "num_tokens": 414002476.0, "step": 10846 }, { "epoch": 1.3798498918712632, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.92852783203125, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8563095331192017, "num_tokens": 414041121.0, "step": 10847 }, { "epoch": 1.3799771021498537, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.4366569519043, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8478860855102539, "num_tokens": 414080447.0, "step": 10848 }, { "epoch": 1.3801043124284442, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.279205322265625, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8476367592811584, "num_tokens": 414108925.0, "step": 10849 }, { "epoch": 1.3802315227070348, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.54780960083008, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8647246360778809, "num_tokens": 414154296.0, "step": 10850 }, { "epoch": 1.3803587329856253, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.168155670166016, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8490029573440552, "num_tokens": 414194612.0, "step": 10851 }, { "epoch": 1.3804859432642158, "ewc_loss": 0.126953125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010728836059570312, "grad_norm": 39.79719924926758, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8640786409378052, "num_tokens": 414229939.0, "step": 10852 }, { "epoch": 1.3806131535428063, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 40.063533782958984, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8564022779464722, "num_tokens": 414265193.0, "step": 10853 }, { "epoch": 1.3807403638213969, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.457862854003906, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8573262095451355, "num_tokens": 414301471.0, "step": 10854 }, { "epoch": 1.3808675740999874, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 40.23247528076172, "learning_rate": 1e-06, "loss": 0.6279, "mean_token_accuracy": 0.8327561616897583, "num_tokens": 414338755.0, "step": 10855 }, { "epoch": 1.380994784378578, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.628746032714844, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.863152265548706, "num_tokens": 414379996.0, "step": 10856 }, { "epoch": 1.3811219946571682, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.81351852416992, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8625203967094421, "num_tokens": 414421905.0, "step": 10857 }, { "epoch": 1.3812492049357588, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.64053726196289, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8654335737228394, "num_tokens": 414465149.0, "step": 10858 }, { "epoch": 1.3813764152143493, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.947181701660156, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8697090744972229, "num_tokens": 414504086.0, "step": 10859 }, { "epoch": 1.3815036254929398, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.443687438964844, "learning_rate": 1e-06, "loss": 0.6088, "mean_token_accuracy": 0.8433659076690674, "num_tokens": 414545392.0, "step": 10860 }, { "epoch": 1.3816308357715303, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.15826416015625, "learning_rate": 1e-06, "loss": 0.654, "mean_token_accuracy": 0.8236339688301086, "num_tokens": 414586445.0, "step": 10861 }, { "epoch": 1.3817580460501209, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.581966400146484, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8701818585395813, "num_tokens": 414618467.0, "step": 10862 }, { "epoch": 1.3818852563287114, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.122859954833984, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8712424635887146, "num_tokens": 414651253.0, "step": 10863 }, { "epoch": 1.382012466607302, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.470638275146484, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8599355220794678, "num_tokens": 414692016.0, "step": 10864 }, { "epoch": 1.3821396768858925, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.98603057861328, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8658050894737244, "num_tokens": 414731840.0, "step": 10865 }, { "epoch": 1.3822668871644828, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.45446014404297, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8620480298995972, "num_tokens": 414767110.0, "step": 10866 }, { "epoch": 1.3823940974430733, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.83327102661133, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8645702600479126, "num_tokens": 414799340.0, "step": 10867 }, { "epoch": 1.3825213077216638, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.855289459228516, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8630355596542358, "num_tokens": 414840915.0, "step": 10868 }, { "epoch": 1.3826485180002543, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.724735260009766, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8474751114845276, "num_tokens": 414877298.0, "step": 10869 }, { "epoch": 1.3827757282788449, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.597023010253906, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8657631278038025, "num_tokens": 414916431.0, "step": 10870 }, { "epoch": 1.3829029385574354, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.84144592285156, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8587261438369751, "num_tokens": 414958148.0, "step": 10871 }, { "epoch": 1.383030148836026, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.07525634765625, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8849108815193176, "num_tokens": 414997008.0, "step": 10872 }, { "epoch": 1.3831573591146165, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.97645950317383, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.848071813583374, "num_tokens": 415042893.0, "step": 10873 }, { "epoch": 1.383284569393207, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.865238189697266, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8582891225814819, "num_tokens": 415077476.0, "step": 10874 }, { "epoch": 1.3834117796717975, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.75543975830078, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8560658693313599, "num_tokens": 415124329.0, "step": 10875 }, { "epoch": 1.383538989950388, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 40.101009368896484, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8718365430831909, "num_tokens": 415164101.0, "step": 10876 }, { "epoch": 1.3836662002289786, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.81828308105469, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8615741729736328, "num_tokens": 415203792.0, "step": 10877 }, { "epoch": 1.383793410507569, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.774295806884766, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8569847941398621, "num_tokens": 415233754.0, "step": 10878 }, { "epoch": 1.3839206207861596, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.817264556884766, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8689994812011719, "num_tokens": 415269567.0, "step": 10879 }, { "epoch": 1.3840478310647502, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.602840423583984, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8641535639762878, "num_tokens": 415312660.0, "step": 10880 }, { "epoch": 1.3841750413433405, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.97238540649414, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8708454370498657, "num_tokens": 415355377.0, "step": 10881 }, { "epoch": 1.384302251621931, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.31195068359375, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.871557891368866, "num_tokens": 415393109.0, "step": 10882 }, { "epoch": 1.3844294619005215, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.88572311401367, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8725597262382507, "num_tokens": 415428650.0, "step": 10883 }, { "epoch": 1.384556672179112, "ewc_loss": 0.1279296875, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010776519775390625, "grad_norm": 39.65834045410156, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8579230308532715, "num_tokens": 415467166.0, "step": 10884 }, { "epoch": 1.3846838824577026, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.62939453125, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8733832836151123, "num_tokens": 415505633.0, "step": 10885 }, { "epoch": 1.384811092736293, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.800262451171875, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8604570031166077, "num_tokens": 415546254.0, "step": 10886 }, { "epoch": 1.3849383030148836, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.55418395996094, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8586016297340393, "num_tokens": 415584999.0, "step": 10887 }, { "epoch": 1.3850655132934742, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.81356430053711, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8619948625564575, "num_tokens": 415624608.0, "step": 10888 }, { "epoch": 1.3851927235720647, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.77265167236328, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8472462892532349, "num_tokens": 415663405.0, "step": 10889 }, { "epoch": 1.385319933850655, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.71299362182617, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8755792379379272, "num_tokens": 415698934.0, "step": 10890 }, { "epoch": 1.3854471441292455, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.891334533691406, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8307887315750122, "num_tokens": 415736973.0, "step": 10891 }, { "epoch": 1.385574354407836, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.9534797668457, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8635287284851074, "num_tokens": 415778364.0, "step": 10892 }, { "epoch": 1.3857015646864266, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.329856872558594, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8769307732582092, "num_tokens": 415817502.0, "step": 10893 }, { "epoch": 1.385828774965017, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.91026306152344, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8552024364471436, "num_tokens": 415860364.0, "step": 10894 }, { "epoch": 1.3859559852436076, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.03767395019531, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.863084077835083, "num_tokens": 415896816.0, "step": 10895 }, { "epoch": 1.3860831955221982, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.39704513549805, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8411096930503845, "num_tokens": 415935173.0, "step": 10896 }, { "epoch": 1.3862104058007887, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.90410614013672, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8701819181442261, "num_tokens": 415977043.0, "step": 10897 }, { "epoch": 1.3863376160793792, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.00923538208008, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8704500198364258, "num_tokens": 416014376.0, "step": 10898 }, { "epoch": 1.3864648263579697, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.496089935302734, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8498304486274719, "num_tokens": 416055648.0, "step": 10899 }, { "epoch": 1.3865920366365603, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.147708892822266, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8505337238311768, "num_tokens": 416087062.0, "step": 10900 }, { "epoch": 1.3867192469151508, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.49654006958008, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8677026033401489, "num_tokens": 416122559.0, "step": 10901 }, { "epoch": 1.3868464571937413, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.020633697509766, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8537876009941101, "num_tokens": 416155228.0, "step": 10902 }, { "epoch": 1.3869736674723319, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.526859283447266, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8473787903785706, "num_tokens": 416191190.0, "step": 10903 }, { "epoch": 1.3871008777509224, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.983341217041016, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8645513653755188, "num_tokens": 416228009.0, "step": 10904 }, { "epoch": 1.387228088029513, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.65460205078125, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8576086163520813, "num_tokens": 416263657.0, "step": 10905 }, { "epoch": 1.3873552983081032, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.43410110473633, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8535247445106506, "num_tokens": 416305187.0, "step": 10906 }, { "epoch": 1.3874825085866938, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.84463119506836, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8573851585388184, "num_tokens": 416342578.0, "step": 10907 }, { "epoch": 1.3876097188652843, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.573219299316406, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8569340705871582, "num_tokens": 416378969.0, "step": 10908 }, { "epoch": 1.3877369291438748, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.502891540527344, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8601945042610168, "num_tokens": 416412958.0, "step": 10909 }, { "epoch": 1.3878641394224653, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.88630676269531, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8409067988395691, "num_tokens": 416445643.0, "step": 10910 }, { "epoch": 1.3879913497010559, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.81229782104492, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8462693691253662, "num_tokens": 416484891.0, "step": 10911 }, { "epoch": 1.3881185599796464, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.64435577392578, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8478630185127258, "num_tokens": 416520504.0, "step": 10912 }, { "epoch": 1.388245770258237, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.997337341308594, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8599399328231812, "num_tokens": 416561877.0, "step": 10913 }, { "epoch": 1.3883729805368275, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.545440673828125, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8600452542304993, "num_tokens": 416597676.0, "step": 10914 }, { "epoch": 1.3885001908154178, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.13631057739258, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8670524954795837, "num_tokens": 416630557.0, "step": 10915 }, { "epoch": 1.3886274010940083, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.342529296875, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8620764017105103, "num_tokens": 416668604.0, "step": 10916 }, { "epoch": 1.3887546113725988, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.11480712890625, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.840248167514801, "num_tokens": 416710246.0, "step": 10917 }, { "epoch": 1.3888818216511893, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.52525329589844, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.860807478427887, "num_tokens": 416744962.0, "step": 10918 }, { "epoch": 1.3890090319297799, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.073280334472656, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8621214628219604, "num_tokens": 416779483.0, "step": 10919 }, { "epoch": 1.3891362422083704, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.30133056640625, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8734453916549683, "num_tokens": 416822135.0, "step": 10920 }, { "epoch": 1.389263452486961, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.30194091796875, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8580661416053772, "num_tokens": 416860383.0, "step": 10921 }, { "epoch": 1.3893906627655515, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.369422912597656, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.856471061706543, "num_tokens": 416896960.0, "step": 10922 }, { "epoch": 1.389517873044142, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.16154479980469, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8722023963928223, "num_tokens": 416931981.0, "step": 10923 }, { "epoch": 1.3896450833227325, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.61909866333008, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8700271844863892, "num_tokens": 416968179.0, "step": 10924 }, { "epoch": 1.389772293601323, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.868186950683594, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8643099069595337, "num_tokens": 417008879.0, "step": 10925 }, { "epoch": 1.3898995038799136, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.96705627441406, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8701896071434021, "num_tokens": 417042497.0, "step": 10926 }, { "epoch": 1.390026714158504, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.526180267333984, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8605372309684753, "num_tokens": 417084396.0, "step": 10927 }, { "epoch": 1.3901539244370946, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.71280288696289, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8721798658370972, "num_tokens": 417121014.0, "step": 10928 }, { "epoch": 1.3902811347156852, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.6584587097168, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8594621419906616, "num_tokens": 417160169.0, "step": 10929 }, { "epoch": 1.3904083449942755, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.054901123046875, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8428254723548889, "num_tokens": 417195024.0, "step": 10930 }, { "epoch": 1.390535555272866, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.468955993652344, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8664498329162598, "num_tokens": 417231344.0, "step": 10931 }, { "epoch": 1.3906627655514565, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.820613861083984, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8519655466079712, "num_tokens": 417270266.0, "step": 10932 }, { "epoch": 1.390789975830047, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.57471466064453, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8583225011825562, "num_tokens": 417308292.0, "step": 10933 }, { "epoch": 1.3909171861086376, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.69004821777344, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8557065725326538, "num_tokens": 417346358.0, "step": 10934 }, { "epoch": 1.391044396387228, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.40518569946289, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8615871071815491, "num_tokens": 417381924.0, "step": 10935 }, { "epoch": 1.3911716066658186, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.626590728759766, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.8329964876174927, "num_tokens": 417417779.0, "step": 10936 }, { "epoch": 1.3912988169444092, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.62387466430664, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8518590927124023, "num_tokens": 417459076.0, "step": 10937 }, { "epoch": 1.3914260272229997, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.68267059326172, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8594034910202026, "num_tokens": 417499422.0, "step": 10938 }, { "epoch": 1.39155323750159, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.34954071044922, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8608611822128296, "num_tokens": 417541776.0, "step": 10939 }, { "epoch": 1.3916804477801805, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.84331512451172, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8472281098365784, "num_tokens": 417576469.0, "step": 10940 }, { "epoch": 1.391807658058771, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.32522964477539, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8602331876754761, "num_tokens": 417611726.0, "step": 10941 }, { "epoch": 1.3919348683373616, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.923912048339844, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8536092042922974, "num_tokens": 417650135.0, "step": 10942 }, { "epoch": 1.392062078615952, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.089813232421875, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8407894372940063, "num_tokens": 417690981.0, "step": 10943 }, { "epoch": 1.3921892888945426, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.94071960449219, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8481964468955994, "num_tokens": 417732152.0, "step": 10944 }, { "epoch": 1.3923164991731332, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.45969009399414, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8487440347671509, "num_tokens": 417771814.0, "step": 10945 }, { "epoch": 1.3924437094517237, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.05687713623047, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8470638990402222, "num_tokens": 417810663.0, "step": 10946 }, { "epoch": 1.3925709197303142, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.74183654785156, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8648679852485657, "num_tokens": 417845664.0, "step": 10947 }, { "epoch": 1.3926981300089047, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.3127555847168, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8519923090934753, "num_tokens": 417889617.0, "step": 10948 }, { "epoch": 1.3928253402874953, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.49603271484375, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.856610894203186, "num_tokens": 417930820.0, "step": 10949 }, { "epoch": 1.3929525505660858, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.27899169921875, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8602419495582581, "num_tokens": 417966799.0, "step": 10950 }, { "epoch": 1.3930797608446763, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.015419006347656, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8509582877159119, "num_tokens": 418009328.0, "step": 10951 }, { "epoch": 1.3932069711232669, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.456661224365234, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8617469072341919, "num_tokens": 418048548.0, "step": 10952 }, { "epoch": 1.3933341814018574, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.02970504760742, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8576068878173828, "num_tokens": 418094947.0, "step": 10953 }, { "epoch": 1.393461391680448, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.62105941772461, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8502974510192871, "num_tokens": 418137144.0, "step": 10954 }, { "epoch": 1.3935886019590382, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.73117446899414, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8814259171485901, "num_tokens": 418171449.0, "step": 10955 }, { "epoch": 1.3937158122376287, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.450836181640625, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8626105189323425, "num_tokens": 418209854.0, "step": 10956 }, { "epoch": 1.3938430225162193, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.852169036865234, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8683352470397949, "num_tokens": 418245121.0, "step": 10957 }, { "epoch": 1.3939702327948098, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.1256103515625, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8631877899169922, "num_tokens": 418284158.0, "step": 10958 }, { "epoch": 1.3940974430734003, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.067047119140625, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8664108514785767, "num_tokens": 418324091.0, "step": 10959 }, { "epoch": 1.3942246533519909, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.44705581665039, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8598453402519226, "num_tokens": 418364764.0, "step": 10960 }, { "epoch": 1.3943518636305814, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.46415710449219, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8559365272521973, "num_tokens": 418403974.0, "step": 10961 }, { "epoch": 1.394479073909172, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.54063034057617, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8495185375213623, "num_tokens": 418446341.0, "step": 10962 }, { "epoch": 1.3946062841877624, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.502960205078125, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8595426082611084, "num_tokens": 418490989.0, "step": 10963 }, { "epoch": 1.3947334944663528, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.824607849121094, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8547098636627197, "num_tokens": 418524633.0, "step": 10964 }, { "epoch": 1.3948607047449433, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.66028594970703, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8579365611076355, "num_tokens": 418562576.0, "step": 10965 }, { "epoch": 1.3949879150235338, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.941986083984375, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8600438833236694, "num_tokens": 418596537.0, "step": 10966 }, { "epoch": 1.3951151253021243, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.23822784423828, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8578004240989685, "num_tokens": 418625800.0, "step": 10967 }, { "epoch": 1.3952423355807149, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.21786880493164, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8446112871170044, "num_tokens": 418663818.0, "step": 10968 }, { "epoch": 1.3953695458593054, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.559288024902344, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.864288330078125, "num_tokens": 418700607.0, "step": 10969 }, { "epoch": 1.395496756137896, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.785911560058594, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8525794744491577, "num_tokens": 418739357.0, "step": 10970 }, { "epoch": 1.3956239664164865, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.87154006958008, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8622748851776123, "num_tokens": 418769752.0, "step": 10971 }, { "epoch": 1.395751176695077, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.77560806274414, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8397407531738281, "num_tokens": 418808196.0, "step": 10972 }, { "epoch": 1.3958783869736675, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.836952209472656, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8751112222671509, "num_tokens": 418843335.0, "step": 10973 }, { "epoch": 1.396005597252258, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.76232147216797, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8692381381988525, "num_tokens": 418876591.0, "step": 10974 }, { "epoch": 1.3961328075308486, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.47910690307617, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8457123041152954, "num_tokens": 418910482.0, "step": 10975 }, { "epoch": 1.396260017809439, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.26106262207031, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8502618670463562, "num_tokens": 418942248.0, "step": 10976 }, { "epoch": 1.3963872280880296, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.94261169433594, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8653545379638672, "num_tokens": 418977770.0, "step": 10977 }, { "epoch": 1.3965144383666201, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.41116714477539, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8517435193061829, "num_tokens": 419016464.0, "step": 10978 }, { "epoch": 1.3966416486452105, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.77497863769531, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8497158885002136, "num_tokens": 419053254.0, "step": 10979 }, { "epoch": 1.396768858923801, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.3940544128418, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8544566631317139, "num_tokens": 419088940.0, "step": 10980 }, { "epoch": 1.3968960692023915, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.668617248535156, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8583285808563232, "num_tokens": 419127897.0, "step": 10981 }, { "epoch": 1.397023279480982, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.54209518432617, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8617932796478271, "num_tokens": 419170921.0, "step": 10982 }, { "epoch": 1.3971504897595726, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.154327392578125, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8652043342590332, "num_tokens": 419206032.0, "step": 10983 }, { "epoch": 1.397277700038163, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.22520065307617, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8746108412742615, "num_tokens": 419244240.0, "step": 10984 }, { "epoch": 1.3974049103167536, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.5873908996582, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8663998246192932, "num_tokens": 419274902.0, "step": 10985 }, { "epoch": 1.3975321205953442, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.2234001159668, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8710826635360718, "num_tokens": 419312295.0, "step": 10986 }, { "epoch": 1.3976593308739347, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.84386444091797, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8423093557357788, "num_tokens": 419350999.0, "step": 10987 }, { "epoch": 1.397786541152525, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.51615524291992, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8598171472549438, "num_tokens": 419386739.0, "step": 10988 }, { "epoch": 1.3979137514311155, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.88986587524414, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8656951189041138, "num_tokens": 419430353.0, "step": 10989 }, { "epoch": 1.398040961709706, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.630271911621094, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8535500168800354, "num_tokens": 419462910.0, "step": 10990 }, { "epoch": 1.3981681719882966, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.73329162597656, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8681800365447998, "num_tokens": 419498727.0, "step": 10991 }, { "epoch": 1.398295382266887, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.557308197021484, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8514325618743896, "num_tokens": 419534496.0, "step": 10992 }, { "epoch": 1.3984225925454776, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.436859130859375, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8554604053497314, "num_tokens": 419575760.0, "step": 10993 }, { "epoch": 1.3985498028240682, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.89322280883789, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8673142194747925, "num_tokens": 419616822.0, "step": 10994 }, { "epoch": 1.3986770131026587, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.62556076049805, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8672691583633423, "num_tokens": 419650425.0, "step": 10995 }, { "epoch": 1.3988042233812492, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.83443832397461, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8674356341362, "num_tokens": 419685501.0, "step": 10996 }, { "epoch": 1.3989314336598397, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.69536590576172, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8628672957420349, "num_tokens": 419724471.0, "step": 10997 }, { "epoch": 1.3990586439384303, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.86264419555664, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8681578636169434, "num_tokens": 419763683.0, "step": 10998 }, { "epoch": 1.3991858542170208, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.61323928833008, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8618814945220947, "num_tokens": 419801238.0, "step": 10999 }, { "epoch": 1.3993130644956113, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.413299560546875, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8559249043464661, "num_tokens": 419832787.0, "step": 11000 }, { "epoch": 1.3994402747742019, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.699302673339844, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8487184047698975, "num_tokens": 419864294.0, "step": 11001 }, { "epoch": 1.3995674850527924, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.581451416015625, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8633741736412048, "num_tokens": 419900493.0, "step": 11002 }, { "epoch": 1.399694695331383, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.85148239135742, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8790536522865295, "num_tokens": 419938917.0, "step": 11003 }, { "epoch": 1.3998219056099732, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.57917022705078, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8749653100967407, "num_tokens": 419978771.0, "step": 11004 }, { "epoch": 1.3999491158885637, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.67707443237305, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8590196967124939, "num_tokens": 420011836.0, "step": 11005 }, { "epoch": 1.4000763261671543, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.34623336791992, "learning_rate": 1e-06, "loss": 0.6248, "mean_token_accuracy": 0.837998628616333, "num_tokens": 420051618.0, "step": 11006 }, { "epoch": 1.4002035364457448, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.849891662597656, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8463253378868103, "num_tokens": 420089119.0, "step": 11007 }, { "epoch": 1.4003307467243353, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.9259033203125, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8704948425292969, "num_tokens": 420123706.0, "step": 11008 }, { "epoch": 1.4004579570029259, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.65351104736328, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8566269874572754, "num_tokens": 420156723.0, "step": 11009 }, { "epoch": 1.4005851672815164, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.532318115234375, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8597120642662048, "num_tokens": 420198519.0, "step": 11010 }, { "epoch": 1.400712377560107, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.92879104614258, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8732610940933228, "num_tokens": 420231722.0, "step": 11011 }, { "epoch": 1.4008395878386974, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.69462966918945, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8632761240005493, "num_tokens": 420272059.0, "step": 11012 }, { "epoch": 1.4009667981172877, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.69887924194336, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8663400411605835, "num_tokens": 420307695.0, "step": 11013 }, { "epoch": 1.4010940083958783, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.48316955566406, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8698090314865112, "num_tokens": 420345989.0, "step": 11014 }, { "epoch": 1.4012212186744688, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.87403869628906, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8660287857055664, "num_tokens": 420387194.0, "step": 11015 }, { "epoch": 1.4013484289530593, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.43630599975586, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.87513267993927, "num_tokens": 420427608.0, "step": 11016 }, { "epoch": 1.4014756392316499, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.59611892700195, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8670912384986877, "num_tokens": 420461363.0, "step": 11017 }, { "epoch": 1.4016028495102404, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.548919677734375, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8426083326339722, "num_tokens": 420503577.0, "step": 11018 }, { "epoch": 1.401730059788831, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.734527587890625, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8710293769836426, "num_tokens": 420538852.0, "step": 11019 }, { "epoch": 1.4018572700674214, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.54644012451172, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8575701713562012, "num_tokens": 420577495.0, "step": 11020 }, { "epoch": 1.401984480346012, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.914798736572266, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8602019548416138, "num_tokens": 420616123.0, "step": 11021 }, { "epoch": 1.4021116906246025, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.529762268066406, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8706185221672058, "num_tokens": 420653904.0, "step": 11022 }, { "epoch": 1.402238900903193, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.95872116088867, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8843306303024292, "num_tokens": 420693539.0, "step": 11023 }, { "epoch": 1.4023661111817836, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.139007568359375, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8652169704437256, "num_tokens": 420729046.0, "step": 11024 }, { "epoch": 1.402493321460374, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.96316146850586, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8624422550201416, "num_tokens": 420766143.0, "step": 11025 }, { "epoch": 1.4026205317389646, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.45759582519531, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.857236385345459, "num_tokens": 420806367.0, "step": 11026 }, { "epoch": 1.4027477420175551, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.497257232666016, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8674122095108032, "num_tokens": 420845081.0, "step": 11027 }, { "epoch": 1.4028749522961454, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.62577438354492, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8661911487579346, "num_tokens": 420885833.0, "step": 11028 }, { "epoch": 1.403002162574736, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.69257354736328, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8665598630905151, "num_tokens": 420916313.0, "step": 11029 }, { "epoch": 1.4031293728533265, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.69121170043945, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8430924415588379, "num_tokens": 420957011.0, "step": 11030 }, { "epoch": 1.403256583131917, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.756282806396484, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8645431995391846, "num_tokens": 420994815.0, "step": 11031 }, { "epoch": 1.4033837934105076, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.497764587402344, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8688889145851135, "num_tokens": 421033977.0, "step": 11032 }, { "epoch": 1.403511003689098, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.55388259887695, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8653632402420044, "num_tokens": 421068871.0, "step": 11033 }, { "epoch": 1.4036382139676886, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.790645599365234, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8612217307090759, "num_tokens": 421105680.0, "step": 11034 }, { "epoch": 1.4037654242462791, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.86164093017578, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8722878694534302, "num_tokens": 421149519.0, "step": 11035 }, { "epoch": 1.4038926345248697, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.72867965698242, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8619504570960999, "num_tokens": 421190195.0, "step": 11036 }, { "epoch": 1.40401984480346, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.124446868896484, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8553926944732666, "num_tokens": 421218637.0, "step": 11037 }, { "epoch": 1.4041470550820505, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.5727653503418, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8646168112754822, "num_tokens": 421261300.0, "step": 11038 }, { "epoch": 1.404274265360641, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.786006927490234, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8567935228347778, "num_tokens": 421297250.0, "step": 11039 }, { "epoch": 1.4044014756392316, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.16672134399414, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8529163002967834, "num_tokens": 421334213.0, "step": 11040 }, { "epoch": 1.404528685917822, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.35304641723633, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8685212135314941, "num_tokens": 421368341.0, "step": 11041 }, { "epoch": 1.4046558961964126, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.05543518066406, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8495106101036072, "num_tokens": 421408278.0, "step": 11042 }, { "epoch": 1.4047831064750032, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.684444427490234, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8525253534317017, "num_tokens": 421442559.0, "step": 11043 }, { "epoch": 1.4049103167535937, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.489688873291016, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8571946024894714, "num_tokens": 421474014.0, "step": 11044 }, { "epoch": 1.4050375270321842, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.644004821777344, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.8356086015701294, "num_tokens": 421511193.0, "step": 11045 }, { "epoch": 1.4051647373107747, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.722412109375, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8611744046211243, "num_tokens": 421547895.0, "step": 11046 }, { "epoch": 1.4052919475893653, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.88374710083008, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8588618636131287, "num_tokens": 421582883.0, "step": 11047 }, { "epoch": 1.4054191578679558, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.42067337036133, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8629813194274902, "num_tokens": 421619838.0, "step": 11048 }, { "epoch": 1.4055463681465463, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 40.12368392944336, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8686355948448181, "num_tokens": 421659451.0, "step": 11049 }, { "epoch": 1.4056735784251368, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.347904205322266, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.855811595916748, "num_tokens": 421704826.0, "step": 11050 }, { "epoch": 1.4058007887037274, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 40.49241256713867, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8570544719696045, "num_tokens": 421748953.0, "step": 11051 }, { "epoch": 1.405927998982318, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.887699127197266, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8436609506607056, "num_tokens": 421786614.0, "step": 11052 }, { "epoch": 1.4060552092609082, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.824893951416016, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8546613454818726, "num_tokens": 421817291.0, "step": 11053 }, { "epoch": 1.4061824195394987, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00010824203491210938, "grad_norm": 39.58821487426758, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8589270114898682, "num_tokens": 421858200.0, "step": 11054 }, { "epoch": 1.4063096298180893, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.08909225463867, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.860073447227478, "num_tokens": 421903343.0, "step": 11055 }, { "epoch": 1.4064368400966798, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.487022399902344, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8762044906616211, "num_tokens": 421941426.0, "step": 11056 }, { "epoch": 1.4065640503752703, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.29281234741211, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.856774091720581, "num_tokens": 421983992.0, "step": 11057 }, { "epoch": 1.4066912606538609, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 40.02632522583008, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8690004348754883, "num_tokens": 422015066.0, "step": 11058 }, { "epoch": 1.4068184709324514, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.046329498291016, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8640699982643127, "num_tokens": 422052392.0, "step": 11059 }, { "epoch": 1.406945681211042, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.43452072143555, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.856376588344574, "num_tokens": 422091453.0, "step": 11060 }, { "epoch": 1.4070728914896324, "ewc_loss": 0.12890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001087188720703125, "grad_norm": 39.764610290527344, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8447697162628174, "num_tokens": 422131019.0, "step": 11061 }, { "epoch": 1.4072001017682227, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.39396286010742, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.853326678276062, "num_tokens": 422172139.0, "step": 11062 }, { "epoch": 1.4073273120468133, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.03192138671875, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.867085874080658, "num_tokens": 422209708.0, "step": 11063 }, { "epoch": 1.4074545223254038, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.166481018066406, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8619215488433838, "num_tokens": 422245091.0, "step": 11064 }, { "epoch": 1.4075817326039943, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.016746520996094, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8611542582511902, "num_tokens": 422280054.0, "step": 11065 }, { "epoch": 1.4077089428825849, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.08951187133789, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8659445643424988, "num_tokens": 422310036.0, "step": 11066 }, { "epoch": 1.4078361531611754, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.293975830078125, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8436782956123352, "num_tokens": 422347914.0, "step": 11067 }, { "epoch": 1.407963363439766, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.70085906982422, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8619958758354187, "num_tokens": 422383207.0, "step": 11068 }, { "epoch": 1.4080905737183564, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.465187072753906, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8629635572433472, "num_tokens": 422414675.0, "step": 11069 }, { "epoch": 1.408217783996947, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.83494567871094, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8493922352790833, "num_tokens": 422449746.0, "step": 11070 }, { "epoch": 1.4083449942755375, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.44804763793945, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8531774282455444, "num_tokens": 422487789.0, "step": 11071 }, { "epoch": 1.408472204554128, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.16770935058594, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8649161458015442, "num_tokens": 422521750.0, "step": 11072 }, { "epoch": 1.4085994148327186, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.636985778808594, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8699254393577576, "num_tokens": 422562923.0, "step": 11073 }, { "epoch": 1.408726625111309, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.823516845703125, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8493728637695312, "num_tokens": 422606832.0, "step": 11074 }, { "epoch": 1.4088538353898996, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.046844482421875, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8626694083213806, "num_tokens": 422644465.0, "step": 11075 }, { "epoch": 1.4089810456684901, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.915645599365234, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8678157925605774, "num_tokens": 422681204.0, "step": 11076 }, { "epoch": 1.4091082559470804, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.89596176147461, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8496348261833191, "num_tokens": 422716325.0, "step": 11077 }, { "epoch": 1.409235466225671, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.41191864013672, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8500164747238159, "num_tokens": 422753644.0, "step": 11078 }, { "epoch": 1.4093626765042615, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.113685607910156, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8603962659835815, "num_tokens": 422790893.0, "step": 11079 }, { "epoch": 1.409489886782852, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.961578369140625, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8522410988807678, "num_tokens": 422829400.0, "step": 11080 }, { "epoch": 1.4096170970614426, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.85090637207031, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.869253933429718, "num_tokens": 422867386.0, "step": 11081 }, { "epoch": 1.409744307340033, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.64311599731445, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.864486813545227, "num_tokens": 422910736.0, "step": 11082 }, { "epoch": 1.4098715176186236, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.031455993652344, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8641355037689209, "num_tokens": 422950302.0, "step": 11083 }, { "epoch": 1.4099987278972141, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.805938720703125, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8590574264526367, "num_tokens": 422985892.0, "step": 11084 }, { "epoch": 1.4101259381758047, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.07118225097656, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.852310299873352, "num_tokens": 423020075.0, "step": 11085 }, { "epoch": 1.410253148454395, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.4387321472168, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8624497652053833, "num_tokens": 423058260.0, "step": 11086 }, { "epoch": 1.4103803587329855, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.368621826171875, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8838536739349365, "num_tokens": 423092158.0, "step": 11087 }, { "epoch": 1.410507569011576, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.73466491699219, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8573430776596069, "num_tokens": 423123999.0, "step": 11088 }, { "epoch": 1.4106347792901666, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.932594299316406, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8574433326721191, "num_tokens": 423167111.0, "step": 11089 }, { "epoch": 1.410761989568757, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.67450714111328, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8526725172996521, "num_tokens": 423202060.0, "step": 11090 }, { "epoch": 1.4108891998473476, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.80585479736328, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8517162799835205, "num_tokens": 423241200.0, "step": 11091 }, { "epoch": 1.4110164101259381, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.94939422607422, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8706350326538086, "num_tokens": 423284713.0, "step": 11092 }, { "epoch": 1.4111436204045287, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.50005340576172, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8619014024734497, "num_tokens": 423322524.0, "step": 11093 }, { "epoch": 1.4112708306831192, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.386226654052734, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8608524799346924, "num_tokens": 423360551.0, "step": 11094 }, { "epoch": 1.4113980409617097, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.67802429199219, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8699043393135071, "num_tokens": 423395227.0, "step": 11095 }, { "epoch": 1.4115252512403003, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.257205963134766, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8471605777740479, "num_tokens": 423434757.0, "step": 11096 }, { "epoch": 1.4116524615188908, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.17121505737305, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8786537647247314, "num_tokens": 423464599.0, "step": 11097 }, { "epoch": 1.4117796717974813, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.97378158569336, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8530459403991699, "num_tokens": 423504327.0, "step": 11098 }, { "epoch": 1.4119068820760718, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.220367431640625, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8424398899078369, "num_tokens": 423543114.0, "step": 11099 }, { "epoch": 1.4120340923546624, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.33353042602539, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8491004109382629, "num_tokens": 423581727.0, "step": 11100 }, { "epoch": 1.412161302633253, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.985939025878906, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8658206462860107, "num_tokens": 423617073.0, "step": 11101 }, { "epoch": 1.4122885129118432, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.109371185302734, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8658329248428345, "num_tokens": 423664922.0, "step": 11102 }, { "epoch": 1.4124157231904337, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.09418869018555, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8518205285072327, "num_tokens": 423706990.0, "step": 11103 }, { "epoch": 1.4125429334690243, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.08915710449219, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8436898589134216, "num_tokens": 423745945.0, "step": 11104 }, { "epoch": 1.4126701437476148, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 40.105838775634766, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8649966716766357, "num_tokens": 423780824.0, "step": 11105 }, { "epoch": 1.4127973540262053, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 40.082759857177734, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8734747171401978, "num_tokens": 423818640.0, "step": 11106 }, { "epoch": 1.4129245643047958, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.1216926574707, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8687849640846252, "num_tokens": 423856657.0, "step": 11107 }, { "epoch": 1.4130517745833864, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.79909896850586, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8716115951538086, "num_tokens": 423900288.0, "step": 11108 }, { "epoch": 1.413178984861977, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.16393280029297, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8666398525238037, "num_tokens": 423935929.0, "step": 11109 }, { "epoch": 1.4133061951405674, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.89421844482422, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.858618974685669, "num_tokens": 423970719.0, "step": 11110 }, { "epoch": 1.4134334054191577, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.058998107910156, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8609397411346436, "num_tokens": 424012338.0, "step": 11111 }, { "epoch": 1.4135606156977483, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.14748001098633, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8415655493736267, "num_tokens": 424050112.0, "step": 11112 }, { "epoch": 1.4136878259763388, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 40.50223159790039, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8602405190467834, "num_tokens": 424092688.0, "step": 11113 }, { "epoch": 1.4138150362549293, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.674442291259766, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8582018613815308, "num_tokens": 424131355.0, "step": 11114 }, { "epoch": 1.4139422465335199, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.553489685058594, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8494714498519897, "num_tokens": 424177768.0, "step": 11115 }, { "epoch": 1.4140694568121104, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010919570922851562, "grad_norm": 39.584022521972656, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8735677003860474, "num_tokens": 424215834.0, "step": 11116 }, { "epoch": 1.414196667090701, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.490150451660156, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8643687963485718, "num_tokens": 424260002.0, "step": 11117 }, { "epoch": 1.4143238773692914, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.64228057861328, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8579555749893188, "num_tokens": 424299343.0, "step": 11118 }, { "epoch": 1.414451087647882, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.58642578125, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.853751540184021, "num_tokens": 424340312.0, "step": 11119 }, { "epoch": 1.4145782979264725, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011014938354492188, "grad_norm": 39.53158950805664, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8701578378677368, "num_tokens": 424376658.0, "step": 11120 }, { "epoch": 1.414705508205063, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.40571212768555, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8705748319625854, "num_tokens": 424415499.0, "step": 11121 }, { "epoch": 1.4148327184836536, "ewc_loss": 0.1298828125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00010967254638671875, "grad_norm": 39.70584487915039, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8616508841514587, "num_tokens": 424453401.0, "step": 11122 }, { "epoch": 1.414959928762244, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.40802764892578, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8647468686103821, "num_tokens": 424490654.0, "step": 11123 }, { "epoch": 1.4150871390408346, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.744441986083984, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.869496762752533, "num_tokens": 424534469.0, "step": 11124 }, { "epoch": 1.4152143493194251, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.243682861328125, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8521959781646729, "num_tokens": 424570187.0, "step": 11125 }, { "epoch": 1.4153415595980154, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.934818267822266, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8405905961990356, "num_tokens": 424607898.0, "step": 11126 }, { "epoch": 1.415468769876606, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.5296516418457, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8643490076065063, "num_tokens": 424652156.0, "step": 11127 }, { "epoch": 1.4155959801551965, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.596046447753906, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8509853482246399, "num_tokens": 424688735.0, "step": 11128 }, { "epoch": 1.415723190433787, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.67518615722656, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8619688749313354, "num_tokens": 424732859.0, "step": 11129 }, { "epoch": 1.4158504007123776, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.8161506652832, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8540734052658081, "num_tokens": 424769381.0, "step": 11130 }, { "epoch": 1.415977610990968, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.70914077758789, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8599809408187866, "num_tokens": 424802362.0, "step": 11131 }, { "epoch": 1.4161048212695586, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.44475173950195, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8559267520904541, "num_tokens": 424837897.0, "step": 11132 }, { "epoch": 1.4162320315481491, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.89836502075195, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8707594871520996, "num_tokens": 424878154.0, "step": 11133 }, { "epoch": 1.4163592418267397, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.351016998291016, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8643709421157837, "num_tokens": 424915063.0, "step": 11134 }, { "epoch": 1.41648645210533, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.914608001708984, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8605992794036865, "num_tokens": 424955159.0, "step": 11135 }, { "epoch": 1.4166136623839205, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.58907699584961, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8521285057067871, "num_tokens": 424992573.0, "step": 11136 }, { "epoch": 1.416740872662511, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.227996826171875, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8639727830886841, "num_tokens": 425030452.0, "step": 11137 }, { "epoch": 1.4168680829411016, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.586769104003906, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8745961785316467, "num_tokens": 425067118.0, "step": 11138 }, { "epoch": 1.416995293219692, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.272518157958984, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8497629165649414, "num_tokens": 425101684.0, "step": 11139 }, { "epoch": 1.4171225034982826, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.25178909301758, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8697439432144165, "num_tokens": 425143127.0, "step": 11140 }, { "epoch": 1.4172497137768731, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.872825622558594, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8589895963668823, "num_tokens": 425173912.0, "step": 11141 }, { "epoch": 1.4173769240554637, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.31742477416992, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8650135397911072, "num_tokens": 425210146.0, "step": 11142 }, { "epoch": 1.4175041343340542, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.72663879394531, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8741662502288818, "num_tokens": 425246082.0, "step": 11143 }, { "epoch": 1.4176313446126447, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.169334411621094, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8582198619842529, "num_tokens": 425287470.0, "step": 11144 }, { "epoch": 1.4177585548912353, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.736175537109375, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8571715354919434, "num_tokens": 425326553.0, "step": 11145 }, { "epoch": 1.4178857651698258, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.99763107299805, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8556196689605713, "num_tokens": 425363105.0, "step": 11146 }, { "epoch": 1.4180129754484163, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.570213317871094, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8565642833709717, "num_tokens": 425402513.0, "step": 11147 }, { "epoch": 1.4181401857270068, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.93594741821289, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8612878322601318, "num_tokens": 425440132.0, "step": 11148 }, { "epoch": 1.4182673960055974, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.75123596191406, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8598400354385376, "num_tokens": 425481738.0, "step": 11149 }, { "epoch": 1.418394606284188, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.207916259765625, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8588496446609497, "num_tokens": 425516916.0, "step": 11150 }, { "epoch": 1.4185218165627782, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.780818939208984, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8604463338851929, "num_tokens": 425555951.0, "step": 11151 }, { "epoch": 1.4186490268413687, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.03281021118164, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8606314659118652, "num_tokens": 425595302.0, "step": 11152 }, { "epoch": 1.4187762371199593, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.773738861083984, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8649842739105225, "num_tokens": 425623281.0, "step": 11153 }, { "epoch": 1.4189034473985498, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.64882278442383, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8704777956008911, "num_tokens": 425659124.0, "step": 11154 }, { "epoch": 1.4190306576771403, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.384090423583984, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8485760688781738, "num_tokens": 425695809.0, "step": 11155 }, { "epoch": 1.4191578679557308, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.68267059326172, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8505744934082031, "num_tokens": 425731890.0, "step": 11156 }, { "epoch": 1.4192850782343214, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.16071701049805, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8683217763900757, "num_tokens": 425770899.0, "step": 11157 }, { "epoch": 1.419412288512912, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.71173095703125, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.87285315990448, "num_tokens": 425807812.0, "step": 11158 }, { "epoch": 1.4195394987915024, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 39.89933395385742, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8558327555656433, "num_tokens": 425848997.0, "step": 11159 }, { "epoch": 1.4196667090700927, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.88658905029297, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8698443174362183, "num_tokens": 425887816.0, "step": 11160 }, { "epoch": 1.4197939193486833, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.77895736694336, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8517196178436279, "num_tokens": 425925488.0, "step": 11161 }, { "epoch": 1.4199211296272738, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.147552490234375, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8660107851028442, "num_tokens": 425965616.0, "step": 11162 }, { "epoch": 1.4200483399058643, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.997467041015625, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8550877571105957, "num_tokens": 426005308.0, "step": 11163 }, { "epoch": 1.4201755501844548, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.044219970703125, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8583027124404907, "num_tokens": 426042339.0, "step": 11164 }, { "epoch": 1.4203027604630454, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.917457580566406, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.860518753528595, "num_tokens": 426081925.0, "step": 11165 }, { "epoch": 1.420429970741636, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.459510803222656, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8550674915313721, "num_tokens": 426120918.0, "step": 11166 }, { "epoch": 1.4205571810202264, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.016685485839844, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8490819931030273, "num_tokens": 426158711.0, "step": 11167 }, { "epoch": 1.420684391298817, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.03476333618164, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8566320538520813, "num_tokens": 426197034.0, "step": 11168 }, { "epoch": 1.4208116015774075, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.92533874511719, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8532496094703674, "num_tokens": 426239390.0, "step": 11169 }, { "epoch": 1.420938811855998, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.18833541870117, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8492044806480408, "num_tokens": 426285488.0, "step": 11170 }, { "epoch": 1.4210660221345885, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.65104293823242, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8475269079208374, "num_tokens": 426323955.0, "step": 11171 }, { "epoch": 1.421193232413179, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.4915657043457, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8794832229614258, "num_tokens": 426359460.0, "step": 11172 }, { "epoch": 1.4213204426917696, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.58110427856445, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8512985706329346, "num_tokens": 426397104.0, "step": 11173 }, { "epoch": 1.4214476529703601, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.08719253540039, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8692821264266968, "num_tokens": 426441349.0, "step": 11174 }, { "epoch": 1.4215748632489504, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.02912139892578, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8683881759643555, "num_tokens": 426478410.0, "step": 11175 }, { "epoch": 1.421702073527541, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.38517761230469, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8590837717056274, "num_tokens": 426519874.0, "step": 11176 }, { "epoch": 1.4218292838061315, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.24308395385742, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8478893041610718, "num_tokens": 426559970.0, "step": 11177 }, { "epoch": 1.421956494084722, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.97275161743164, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.862127423286438, "num_tokens": 426593897.0, "step": 11178 }, { "epoch": 1.4220837043633126, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.26832962036133, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8609445691108704, "num_tokens": 426633712.0, "step": 11179 }, { "epoch": 1.422210914641903, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.964412689208984, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8760807514190674, "num_tokens": 426669085.0, "step": 11180 }, { "epoch": 1.4223381249204936, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.87160110473633, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8751533627510071, "num_tokens": 426709260.0, "step": 11181 }, { "epoch": 1.4224653351990841, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.81413269042969, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8497663736343384, "num_tokens": 426747747.0, "step": 11182 }, { "epoch": 1.4225925454776747, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.58003234863281, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8480150103569031, "num_tokens": 426788185.0, "step": 11183 }, { "epoch": 1.422719755756265, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.02501678466797, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8619191646575928, "num_tokens": 426828663.0, "step": 11184 }, { "epoch": 1.4228469660348555, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.72421646118164, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8492490649223328, "num_tokens": 426869205.0, "step": 11185 }, { "epoch": 1.422974176313446, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.81907653808594, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8754607439041138, "num_tokens": 426900561.0, "step": 11186 }, { "epoch": 1.4231013865920366, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.24087142944336, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8746964931488037, "num_tokens": 426940618.0, "step": 11187 }, { "epoch": 1.423228596870627, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.079551696777344, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8515105843544006, "num_tokens": 426975509.0, "step": 11188 }, { "epoch": 1.4233558071492176, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 39.915409088134766, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8691744804382324, "num_tokens": 427011647.0, "step": 11189 }, { "epoch": 1.4234830174278081, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.06051254272461, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8571679592132568, "num_tokens": 427046631.0, "step": 11190 }, { "epoch": 1.4236102277063987, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.148895263671875, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8582695722579956, "num_tokens": 427081364.0, "step": 11191 }, { "epoch": 1.4237374379849892, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.972618103027344, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8629759550094604, "num_tokens": 427118667.0, "step": 11192 }, { "epoch": 1.4238646482635797, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.231422424316406, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8644869327545166, "num_tokens": 427155017.0, "step": 11193 }, { "epoch": 1.4239918585421703, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.34746170043945, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8529365062713623, "num_tokens": 427191741.0, "step": 11194 }, { "epoch": 1.4241190688207608, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.05916976928711, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8644612431526184, "num_tokens": 427226210.0, "step": 11195 }, { "epoch": 1.4242462790993513, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.98678970336914, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8571635484695435, "num_tokens": 427261699.0, "step": 11196 }, { "epoch": 1.4243734893779418, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.987369537353516, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8529602289199829, "num_tokens": 427297158.0, "step": 11197 }, { "epoch": 1.4245006996565324, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.16365432739258, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.844070315361023, "num_tokens": 427338234.0, "step": 11198 }, { "epoch": 1.424627909935123, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.31958770751953, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8547928929328918, "num_tokens": 427376727.0, "step": 11199 }, { "epoch": 1.4247551202137132, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.92042922973633, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8737293481826782, "num_tokens": 427417095.0, "step": 11200 }, { "epoch": 1.4248823304923037, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.394657135009766, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8443391919136047, "num_tokens": 427453243.0, "step": 11201 }, { "epoch": 1.4250095407708943, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.83613204956055, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8635838031768799, "num_tokens": 427490961.0, "step": 11202 }, { "epoch": 1.4251367510494848, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.44282531738281, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8708377480506897, "num_tokens": 427521487.0, "step": 11203 }, { "epoch": 1.4252639613280753, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.85157012939453, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8576186895370483, "num_tokens": 427556607.0, "step": 11204 }, { "epoch": 1.4253911716066658, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.291259765625, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8610310554504395, "num_tokens": 427598940.0, "step": 11205 }, { "epoch": 1.4255183818852564, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.93793487548828, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8501806259155273, "num_tokens": 427641337.0, "step": 11206 }, { "epoch": 1.425645592163847, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.62956237792969, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8732871413230896, "num_tokens": 427680235.0, "step": 11207 }, { "epoch": 1.4257728024424374, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.269798278808594, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8612713813781738, "num_tokens": 427719918.0, "step": 11208 }, { "epoch": 1.4259000127210277, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.448795318603516, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8630235195159912, "num_tokens": 427758042.0, "step": 11209 }, { "epoch": 1.4260272229996183, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.129947662353516, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8630166053771973, "num_tokens": 427794591.0, "step": 11210 }, { "epoch": 1.4261544332782088, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.44760513305664, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.868037760257721, "num_tokens": 427828190.0, "step": 11211 }, { "epoch": 1.4262816435567993, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.982154846191406, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8532299995422363, "num_tokens": 427870363.0, "step": 11212 }, { "epoch": 1.4264088538353898, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.24998092651367, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8759729862213135, "num_tokens": 427910037.0, "step": 11213 }, { "epoch": 1.4265360641139804, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.988460540771484, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8599151968955994, "num_tokens": 427945782.0, "step": 11214 }, { "epoch": 1.426663274392571, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.302677154541016, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.867673397064209, "num_tokens": 427984635.0, "step": 11215 }, { "epoch": 1.4267904846711614, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.587608337402344, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8766137361526489, "num_tokens": 428024289.0, "step": 11216 }, { "epoch": 1.426917694949752, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.84103012084961, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8526914119720459, "num_tokens": 428067119.0, "step": 11217 }, { "epoch": 1.4270449052283425, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.47528839111328, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8581916093826294, "num_tokens": 428108150.0, "step": 11218 }, { "epoch": 1.427172115506933, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.1961669921875, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8711923360824585, "num_tokens": 428150839.0, "step": 11219 }, { "epoch": 1.4272993257855235, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.5993537902832, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8486863374710083, "num_tokens": 428194325.0, "step": 11220 }, { "epoch": 1.427426536064114, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.25262451171875, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8588008880615234, "num_tokens": 428230879.0, "step": 11221 }, { "epoch": 1.4275537463427046, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.72403335571289, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8724387884140015, "num_tokens": 428265795.0, "step": 11222 }, { "epoch": 1.4276809566212951, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.962120056152344, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8663784861564636, "num_tokens": 428300413.0, "step": 11223 }, { "epoch": 1.4278081668998854, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.27503204345703, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8818678855895996, "num_tokens": 428339293.0, "step": 11224 }, { "epoch": 1.427935377178476, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.914363861083984, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8737180233001709, "num_tokens": 428374158.0, "step": 11225 }, { "epoch": 1.4280625874570665, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.01096725463867, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.862095832824707, "num_tokens": 428411715.0, "step": 11226 }, { "epoch": 1.428189797735657, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.24922180175781, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8486902713775635, "num_tokens": 428450000.0, "step": 11227 }, { "epoch": 1.4283170080142475, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.69416046142578, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8663058876991272, "num_tokens": 428484546.0, "step": 11228 }, { "epoch": 1.428444218292838, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.92054748535156, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8616930246353149, "num_tokens": 428527519.0, "step": 11229 }, { "epoch": 1.4285714285714286, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.81831741333008, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8565905690193176, "num_tokens": 428568744.0, "step": 11230 }, { "epoch": 1.4286986388500191, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.95132827758789, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.856857419013977, "num_tokens": 428613050.0, "step": 11231 }, { "epoch": 1.4288258491286097, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.94009017944336, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8581032752990723, "num_tokens": 428647334.0, "step": 11232 }, { "epoch": 1.4289530594072, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.51200485229492, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8677813410758972, "num_tokens": 428681015.0, "step": 11233 }, { "epoch": 1.4290802696857905, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.856605529785156, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8547450304031372, "num_tokens": 428718903.0, "step": 11234 }, { "epoch": 1.429207479964381, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.207916259765625, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8703739643096924, "num_tokens": 428756038.0, "step": 11235 }, { "epoch": 1.4293346902429716, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.15032958984375, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8777258396148682, "num_tokens": 428793908.0, "step": 11236 }, { "epoch": 1.429461900521562, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.10904312133789, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8546923398971558, "num_tokens": 428830924.0, "step": 11237 }, { "epoch": 1.4295891108001526, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.08483123779297, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8592019081115723, "num_tokens": 428869083.0, "step": 11238 }, { "epoch": 1.4297163210787431, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.04314422607422, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8360259532928467, "num_tokens": 428903970.0, "step": 11239 }, { "epoch": 1.4298435313573337, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.25852966308594, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8644143342971802, "num_tokens": 428940710.0, "step": 11240 }, { "epoch": 1.4299707416359242, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 39.90882873535156, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8635636568069458, "num_tokens": 428981160.0, "step": 11241 }, { "epoch": 1.4300979519145147, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.10370635986328, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8614447116851807, "num_tokens": 429022464.0, "step": 11242 }, { "epoch": 1.4302251621931052, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.7149543762207, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.862601637840271, "num_tokens": 429060964.0, "step": 11243 }, { "epoch": 1.4303523724716958, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.61644744873047, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.863351047039032, "num_tokens": 429101389.0, "step": 11244 }, { "epoch": 1.4304795827502863, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.10893630981445, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8570047616958618, "num_tokens": 429135547.0, "step": 11245 }, { "epoch": 1.4306067930288768, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.12097930908203, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8643388748168945, "num_tokens": 429177504.0, "step": 11246 }, { "epoch": 1.4307340033074674, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.29092025756836, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.864835798740387, "num_tokens": 429217923.0, "step": 11247 }, { "epoch": 1.430861213586058, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.01803970336914, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8600600957870483, "num_tokens": 429251514.0, "step": 11248 }, { "epoch": 1.4309884238646482, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.29399108886719, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8452905416488647, "num_tokens": 429291747.0, "step": 11249 }, { "epoch": 1.4311156341432387, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.29108428955078, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8587004542350769, "num_tokens": 429333480.0, "step": 11250 }, { "epoch": 1.4312428444218293, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.377777099609375, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.873136043548584, "num_tokens": 429366482.0, "step": 11251 }, { "epoch": 1.4313700547004198, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.03423309326172, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.852998673915863, "num_tokens": 429400704.0, "step": 11252 }, { "epoch": 1.4314972649790103, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.0898551940918, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8492576479911804, "num_tokens": 429437463.0, "step": 11253 }, { "epoch": 1.4316244752576008, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.07469940185547, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8497699499130249, "num_tokens": 429474496.0, "step": 11254 }, { "epoch": 1.4317516855361914, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.54698944091797, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8472412824630737, "num_tokens": 429514641.0, "step": 11255 }, { "epoch": 1.431878895814782, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.876800537109375, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8608653545379639, "num_tokens": 429554054.0, "step": 11256 }, { "epoch": 1.4320061060933724, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.6987190246582, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8579114675521851, "num_tokens": 429594004.0, "step": 11257 }, { "epoch": 1.4321333163719627, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.95451736450195, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8676894903182983, "num_tokens": 429633019.0, "step": 11258 }, { "epoch": 1.4322605266505533, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 41.07546615600586, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8618882894515991, "num_tokens": 429666869.0, "step": 11259 }, { "epoch": 1.4323877369291438, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.01948547363281, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8712899088859558, "num_tokens": 429700049.0, "step": 11260 }, { "epoch": 1.4325149472077343, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 41.001365661621094, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8576027154922485, "num_tokens": 429737517.0, "step": 11261 }, { "epoch": 1.4326421574863248, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.15951156616211, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8741296529769897, "num_tokens": 429778697.0, "step": 11262 }, { "epoch": 1.4327693677649154, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.65321731567383, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8615687489509583, "num_tokens": 429811883.0, "step": 11263 }, { "epoch": 1.432896578043506, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.0734977722168, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.866080641746521, "num_tokens": 429851748.0, "step": 11264 }, { "epoch": 1.4330237883220964, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.872615814208984, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8507760763168335, "num_tokens": 429891033.0, "step": 11265 }, { "epoch": 1.433150998600687, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.16681671142578, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8632285594940186, "num_tokens": 429935980.0, "step": 11266 }, { "epoch": 1.4332782088792775, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.741573333740234, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8491776585578918, "num_tokens": 429970874.0, "step": 11267 }, { "epoch": 1.433405419157868, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.16960525512695, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8458846807479858, "num_tokens": 430017909.0, "step": 11268 }, { "epoch": 1.4335326294364585, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.45669937133789, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8353875279426575, "num_tokens": 430050641.0, "step": 11269 }, { "epoch": 1.433659839715049, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.32216262817383, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8649114966392517, "num_tokens": 430082985.0, "step": 11270 }, { "epoch": 1.4337870499936396, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.61562728881836, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8609433174133301, "num_tokens": 430122340.0, "step": 11271 }, { "epoch": 1.4339142602722301, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.432891845703125, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8465245962142944, "num_tokens": 430162140.0, "step": 11272 }, { "epoch": 1.4340414705508204, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.51898956298828, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8729186654090881, "num_tokens": 430201672.0, "step": 11273 }, { "epoch": 1.434168680829411, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.45909118652344, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.850286602973938, "num_tokens": 430239760.0, "step": 11274 }, { "epoch": 1.4342958911080015, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.656253814697266, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8434553146362305, "num_tokens": 430280527.0, "step": 11275 }, { "epoch": 1.434423101386592, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.91128921508789, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8574888706207275, "num_tokens": 430312788.0, "step": 11276 }, { "epoch": 1.4345503116651825, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.59659957885742, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.868322491645813, "num_tokens": 430348044.0, "step": 11277 }, { "epoch": 1.434677521943773, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.90669631958008, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8772525191307068, "num_tokens": 430385676.0, "step": 11278 }, { "epoch": 1.4348047322223636, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.83570861816406, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8570890426635742, "num_tokens": 430416976.0, "step": 11279 }, { "epoch": 1.4349319425009541, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.8408203125, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8723238706588745, "num_tokens": 430460002.0, "step": 11280 }, { "epoch": 1.4350591527795447, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.86547088623047, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8373642563819885, "num_tokens": 430503172.0, "step": 11281 }, { "epoch": 1.435186363058135, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.631065368652344, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8477397561073303, "num_tokens": 430541221.0, "step": 11282 }, { "epoch": 1.4353135733367255, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 41.45635223388672, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8531233072280884, "num_tokens": 430579527.0, "step": 11283 }, { "epoch": 1.435440783615316, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.55514907836914, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8612111806869507, "num_tokens": 430617263.0, "step": 11284 }, { "epoch": 1.4355679938939065, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.35712814331055, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8730882406234741, "num_tokens": 430649961.0, "step": 11285 }, { "epoch": 1.435695204172497, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.64435577392578, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8542850613594055, "num_tokens": 430682599.0, "step": 11286 }, { "epoch": 1.4358224144510876, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.615047454833984, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8330940008163452, "num_tokens": 430721014.0, "step": 11287 }, { "epoch": 1.4359496247296781, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.665367126464844, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8595094680786133, "num_tokens": 430757014.0, "step": 11288 }, { "epoch": 1.4360768350082687, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.039310455322266, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8586819767951965, "num_tokens": 430796169.0, "step": 11289 }, { "epoch": 1.4362040452868592, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.09242630004883, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8597872853279114, "num_tokens": 430832726.0, "step": 11290 }, { "epoch": 1.4363312555654497, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 41.23494338989258, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8496534824371338, "num_tokens": 430866343.0, "step": 11291 }, { "epoch": 1.4364584658440402, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 39.97562789916992, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8658769726753235, "num_tokens": 430901622.0, "step": 11292 }, { "epoch": 1.4365856761226308, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.46026611328125, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8739773035049438, "num_tokens": 430937499.0, "step": 11293 }, { "epoch": 1.4367128864012213, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.96873474121094, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8474685549736023, "num_tokens": 430974048.0, "step": 11294 }, { "epoch": 1.4368400966798118, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.330631256103516, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8676543235778809, "num_tokens": 431010927.0, "step": 11295 }, { "epoch": 1.4369673069584024, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.33539962768555, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8447139263153076, "num_tokens": 431051731.0, "step": 11296 }, { "epoch": 1.4370945172369929, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.804176330566406, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8759174346923828, "num_tokens": 431087876.0, "step": 11297 }, { "epoch": 1.4372217275155832, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.487144470214844, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8535501956939697, "num_tokens": 431126217.0, "step": 11298 }, { "epoch": 1.4373489377941737, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.107913970947266, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8605787754058838, "num_tokens": 431168049.0, "step": 11299 }, { "epoch": 1.4374761480727642, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.22688674926758, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8584481477737427, "num_tokens": 431211143.0, "step": 11300 }, { "epoch": 1.4376033583513548, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.30375289916992, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8667577505111694, "num_tokens": 431255747.0, "step": 11301 }, { "epoch": 1.4377305686299453, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 39.82334518432617, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8628156185150146, "num_tokens": 431296618.0, "step": 11302 }, { "epoch": 1.4378577789085358, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.20391082763672, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8617945909500122, "num_tokens": 431336493.0, "step": 11303 }, { "epoch": 1.4379849891871264, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.01607131958008, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8571795225143433, "num_tokens": 431376200.0, "step": 11304 }, { "epoch": 1.438112199465717, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.083797454833984, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8399054408073425, "num_tokens": 431414015.0, "step": 11305 }, { "epoch": 1.4382394097443074, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.05835723876953, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8567805290222168, "num_tokens": 431453498.0, "step": 11306 }, { "epoch": 1.4383666200228977, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.35194778442383, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8611094355583191, "num_tokens": 431491126.0, "step": 11307 }, { "epoch": 1.4384938303014883, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.06563186645508, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8835042715072632, "num_tokens": 431528549.0, "step": 11308 }, { "epoch": 1.4386210405800788, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.180973052978516, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8679372668266296, "num_tokens": 431569435.0, "step": 11309 }, { "epoch": 1.4387482508586693, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.11137771606445, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8655043840408325, "num_tokens": 431606125.0, "step": 11310 }, { "epoch": 1.4388754611372598, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.92238235473633, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8540450930595398, "num_tokens": 431643517.0, "step": 11311 }, { "epoch": 1.4390026714158504, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.0848274230957, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8691633939743042, "num_tokens": 431682058.0, "step": 11312 }, { "epoch": 1.439129881694441, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.166290283203125, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8721184134483337, "num_tokens": 431721241.0, "step": 11313 }, { "epoch": 1.4392570919730314, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 39.83372116088867, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8716472387313843, "num_tokens": 431757809.0, "step": 11314 }, { "epoch": 1.439384302251622, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.05180358886719, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8484344482421875, "num_tokens": 431795931.0, "step": 11315 }, { "epoch": 1.4395115125302125, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.23229217529297, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8693573474884033, "num_tokens": 431830517.0, "step": 11316 }, { "epoch": 1.439638722808803, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.133636474609375, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8625264763832092, "num_tokens": 431866047.0, "step": 11317 }, { "epoch": 1.4397659330873935, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 39.70805740356445, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8485381603240967, "num_tokens": 431901602.0, "step": 11318 }, { "epoch": 1.439893143365984, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.38652420043945, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8549771308898926, "num_tokens": 431934989.0, "step": 11319 }, { "epoch": 1.4400203536445746, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.94853973388672, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8661741614341736, "num_tokens": 431968664.0, "step": 11320 }, { "epoch": 1.4401475639231651, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.147003173828125, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8718820810317993, "num_tokens": 432008025.0, "step": 11321 }, { "epoch": 1.4402747742017554, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.218753814697266, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8623061180114746, "num_tokens": 432043526.0, "step": 11322 }, { "epoch": 1.440401984480346, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.56953430175781, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8622159361839294, "num_tokens": 432085175.0, "step": 11323 }, { "epoch": 1.4405291947589365, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.78927230834961, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8513491153717041, "num_tokens": 432122379.0, "step": 11324 }, { "epoch": 1.440656405037527, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.699588775634766, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8634169697761536, "num_tokens": 432157690.0, "step": 11325 }, { "epoch": 1.4407836153161175, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.51487350463867, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8695588111877441, "num_tokens": 432188885.0, "step": 11326 }, { "epoch": 1.440910825594708, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.98591995239258, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8615098595619202, "num_tokens": 432228804.0, "step": 11327 }, { "epoch": 1.4410380358732986, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.31009292602539, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8431905508041382, "num_tokens": 432264150.0, "step": 11328 }, { "epoch": 1.4411652461518891, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.2862663269043, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8534756898880005, "num_tokens": 432306759.0, "step": 11329 }, { "epoch": 1.4412924564304797, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 39.96261215209961, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8711131811141968, "num_tokens": 432346928.0, "step": 11330 }, { "epoch": 1.44141966670907, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.11174011230469, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8685961365699768, "num_tokens": 432386630.0, "step": 11331 }, { "epoch": 1.4415468769876605, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.089420318603516, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8474474549293518, "num_tokens": 432427331.0, "step": 11332 }, { "epoch": 1.441674087266251, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 39.720703125, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8517367243766785, "num_tokens": 432462901.0, "step": 11333 }, { "epoch": 1.4418012975448415, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.456172943115234, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8575097322463989, "num_tokens": 432494232.0, "step": 11334 }, { "epoch": 1.441928507823432, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.79825973510742, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.872900128364563, "num_tokens": 432534276.0, "step": 11335 }, { "epoch": 1.4420557181020226, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.30554962158203, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8522436618804932, "num_tokens": 432577337.0, "step": 11336 }, { "epoch": 1.4421829283806131, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.72154235839844, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8518617153167725, "num_tokens": 432614290.0, "step": 11337 }, { "epoch": 1.4423101386592037, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 39.98974609375, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8625978231430054, "num_tokens": 432648558.0, "step": 11338 }, { "epoch": 1.4424373489377942, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.016075134277344, "learning_rate": 1e-06, "loss": 0.6366, "mean_token_accuracy": 0.8334102630615234, "num_tokens": 432684243.0, "step": 11339 }, { "epoch": 1.4425645592163847, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.41172409057617, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8601008653640747, "num_tokens": 432719355.0, "step": 11340 }, { "epoch": 1.4426917694949752, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 39.69950485229492, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8612957000732422, "num_tokens": 432758675.0, "step": 11341 }, { "epoch": 1.4428189797735658, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.61285400390625, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8568191528320312, "num_tokens": 432801677.0, "step": 11342 }, { "epoch": 1.4429461900521563, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.52812194824219, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8589102029800415, "num_tokens": 432836299.0, "step": 11343 }, { "epoch": 1.4430734003307468, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.416419982910156, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8595358729362488, "num_tokens": 432868838.0, "step": 11344 }, { "epoch": 1.4432006106093374, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.62765884399414, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8626389503479004, "num_tokens": 432904895.0, "step": 11345 }, { "epoch": 1.4433278208879279, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.71775817871094, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8546472787857056, "num_tokens": 432944785.0, "step": 11346 }, { "epoch": 1.4434550311665182, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 39.80488586425781, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8495882749557495, "num_tokens": 432975645.0, "step": 11347 }, { "epoch": 1.4435822414451087, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.93830108642578, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8545452356338501, "num_tokens": 433016887.0, "step": 11348 }, { "epoch": 1.4437094517236992, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.63163757324219, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8616452217102051, "num_tokens": 433053666.0, "step": 11349 }, { "epoch": 1.4438366620022898, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.4014892578125, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8555203676223755, "num_tokens": 433101131.0, "step": 11350 }, { "epoch": 1.4439638722808803, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.16542053222656, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8673446178436279, "num_tokens": 433132535.0, "step": 11351 }, { "epoch": 1.4440910825594708, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.164188385009766, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8446546196937561, "num_tokens": 433168229.0, "step": 11352 }, { "epoch": 1.4442182928380614, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.1116943359375, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8641372919082642, "num_tokens": 433206094.0, "step": 11353 }, { "epoch": 1.4443455031166519, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.465450286865234, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8475356101989746, "num_tokens": 433241038.0, "step": 11354 }, { "epoch": 1.4444727133952424, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.19749069213867, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8522751331329346, "num_tokens": 433274674.0, "step": 11355 }, { "epoch": 1.4445999236738327, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.374183654785156, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8629481792449951, "num_tokens": 433314511.0, "step": 11356 }, { "epoch": 1.4447271339524232, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.1059455871582, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8619703054428101, "num_tokens": 433349338.0, "step": 11357 }, { "epoch": 1.4448543442310138, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.411624908447266, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8693933486938477, "num_tokens": 433382609.0, "step": 11358 }, { "epoch": 1.4449815545096043, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.289493560791016, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8751204609870911, "num_tokens": 433411553.0, "step": 11359 }, { "epoch": 1.4451087647881948, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.0218505859375, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8555980920791626, "num_tokens": 433450571.0, "step": 11360 }, { "epoch": 1.4452359750667854, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.34642028808594, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8554111123085022, "num_tokens": 433486994.0, "step": 11361 }, { "epoch": 1.445363185345376, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.02107238769531, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8715519905090332, "num_tokens": 433521407.0, "step": 11362 }, { "epoch": 1.4454903956239664, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.68141555786133, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8706828355789185, "num_tokens": 433561338.0, "step": 11363 }, { "epoch": 1.445617605902557, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.46030807495117, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8679580688476562, "num_tokens": 433598892.0, "step": 11364 }, { "epoch": 1.4457448161811475, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.17238998413086, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8500899076461792, "num_tokens": 433641423.0, "step": 11365 }, { "epoch": 1.445872026459738, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.56591033935547, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8638811707496643, "num_tokens": 433677385.0, "step": 11366 }, { "epoch": 1.4459992367383285, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 39.79899978637695, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8593257069587708, "num_tokens": 433716901.0, "step": 11367 }, { "epoch": 1.446126447016919, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.65000534057617, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8548736572265625, "num_tokens": 433755099.0, "step": 11368 }, { "epoch": 1.4462536572955096, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.93907928466797, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8568328619003296, "num_tokens": 433794012.0, "step": 11369 }, { "epoch": 1.4463808675741001, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.94167709350586, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8636367917060852, "num_tokens": 433831965.0, "step": 11370 }, { "epoch": 1.4465080778526904, "ewc_loss": 0.130859375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.000110626220703125, "grad_norm": 39.84349060058594, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.86037677526474, "num_tokens": 433870819.0, "step": 11371 }, { "epoch": 1.446635288131281, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.88020706176758, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8538931012153625, "num_tokens": 433909197.0, "step": 11372 }, { "epoch": 1.4467624984098715, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 39.79194259643555, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8665121793746948, "num_tokens": 433946026.0, "step": 11373 }, { "epoch": 1.446889708688462, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.940162658691406, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.881682276725769, "num_tokens": 433986239.0, "step": 11374 }, { "epoch": 1.4470169189670525, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.664939880371094, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8648170828819275, "num_tokens": 434022557.0, "step": 11375 }, { "epoch": 1.447144129245643, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.775264739990234, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8649476766586304, "num_tokens": 434059258.0, "step": 11376 }, { "epoch": 1.4472713395242336, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.03477478027344, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8439418077468872, "num_tokens": 434094872.0, "step": 11377 }, { "epoch": 1.4473985498028241, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.24529266357422, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8365343809127808, "num_tokens": 434134842.0, "step": 11378 }, { "epoch": 1.4475257600814146, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.222373962402344, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8706174492835999, "num_tokens": 434173152.0, "step": 11379 }, { "epoch": 1.447652970360005, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.92947769165039, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8619223237037659, "num_tokens": 434208318.0, "step": 11380 }, { "epoch": 1.4477801806385955, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.367958068847656, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.857488214969635, "num_tokens": 434243693.0, "step": 11381 }, { "epoch": 1.447907390917186, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.0048942565918, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8484892249107361, "num_tokens": 434283172.0, "step": 11382 }, { "epoch": 1.4480346011957765, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.21471405029297, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8615332245826721, "num_tokens": 434318789.0, "step": 11383 }, { "epoch": 1.448161811474367, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.35750961303711, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8530070185661316, "num_tokens": 434358943.0, "step": 11384 }, { "epoch": 1.4482890217529576, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.37321472167969, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8613858222961426, "num_tokens": 434398264.0, "step": 11385 }, { "epoch": 1.4484162320315481, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.397220611572266, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8703850507736206, "num_tokens": 434433602.0, "step": 11386 }, { "epoch": 1.4485434423101387, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 39.979209899902344, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8550400137901306, "num_tokens": 434480001.0, "step": 11387 }, { "epoch": 1.4486706525887292, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 39.94694519042969, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8743928670883179, "num_tokens": 434519285.0, "step": 11388 }, { "epoch": 1.4487978628673197, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.674922943115234, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8593063354492188, "num_tokens": 434556118.0, "step": 11389 }, { "epoch": 1.4489250731459102, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 39.715145111083984, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8443962335586548, "num_tokens": 434595350.0, "step": 11390 }, { "epoch": 1.4490522834245008, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.28688049316406, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8582153916358948, "num_tokens": 434632988.0, "step": 11391 }, { "epoch": 1.4491794937030913, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.954341888427734, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8677473068237305, "num_tokens": 434673040.0, "step": 11392 }, { "epoch": 1.4493067039816818, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.46034240722656, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8759604096412659, "num_tokens": 434712201.0, "step": 11393 }, { "epoch": 1.4494339142602723, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.83953094482422, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8744151592254639, "num_tokens": 434756676.0, "step": 11394 }, { "epoch": 1.4495611245388629, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.184993743896484, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8638287782669067, "num_tokens": 434797890.0, "step": 11395 }, { "epoch": 1.4496883348174532, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.099754333496094, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8731101155281067, "num_tokens": 434836933.0, "step": 11396 }, { "epoch": 1.4498155450960437, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.38838195800781, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8730297088623047, "num_tokens": 434868098.0, "step": 11397 }, { "epoch": 1.4499427553746342, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.256980895996094, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8552365303039551, "num_tokens": 434907239.0, "step": 11398 }, { "epoch": 1.4500699656532248, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.394805908203125, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8661700487136841, "num_tokens": 434945672.0, "step": 11399 }, { "epoch": 1.4501971759318153, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.26462173461914, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8495185375213623, "num_tokens": 434981693.0, "step": 11400 }, { "epoch": 1.4503243862104058, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.39875411987305, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.882468581199646, "num_tokens": 435012452.0, "step": 11401 }, { "epoch": 1.4504515964889964, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.48706817626953, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8536909222602844, "num_tokens": 435052711.0, "step": 11402 }, { "epoch": 1.4505788067675869, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.15838623046875, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8599033951759338, "num_tokens": 435092766.0, "step": 11403 }, { "epoch": 1.4507060170461774, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.42740249633789, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8452714085578918, "num_tokens": 435132093.0, "step": 11404 }, { "epoch": 1.4508332273247677, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.28203201293945, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8563148975372314, "num_tokens": 435169071.0, "step": 11405 }, { "epoch": 1.4509604376033582, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.61296463012695, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8703882694244385, "num_tokens": 435202665.0, "step": 11406 }, { "epoch": 1.4510876478819488, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.31989288330078, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.875181257724762, "num_tokens": 435234719.0, "step": 11407 }, { "epoch": 1.4512148581605393, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.14275360107422, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8752466440200806, "num_tokens": 435267090.0, "step": 11408 }, { "epoch": 1.4513420684391298, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.386207580566406, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8673535585403442, "num_tokens": 435307243.0, "step": 11409 }, { "epoch": 1.4514692787177204, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.151329040527344, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8680827617645264, "num_tokens": 435347507.0, "step": 11410 }, { "epoch": 1.4515964889963109, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.211769104003906, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8493654727935791, "num_tokens": 435383499.0, "step": 11411 }, { "epoch": 1.4517236992749014, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.311279296875, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8611587285995483, "num_tokens": 435423500.0, "step": 11412 }, { "epoch": 1.451850909553492, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.323158264160156, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8649458885192871, "num_tokens": 435457075.0, "step": 11413 }, { "epoch": 1.4519781198320825, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.312278747558594, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8605929613113403, "num_tokens": 435498933.0, "step": 11414 }, { "epoch": 1.452105330110673, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.5440788269043, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8531913757324219, "num_tokens": 435535949.0, "step": 11415 }, { "epoch": 1.4522325403892635, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.1850471496582, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8535590171813965, "num_tokens": 435565252.0, "step": 11416 }, { "epoch": 1.452359750667854, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.67034149169922, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8419927358627319, "num_tokens": 435604663.0, "step": 11417 }, { "epoch": 1.4524869609464446, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 39.95901107788086, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8559459447860718, "num_tokens": 435640883.0, "step": 11418 }, { "epoch": 1.4526141712250351, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.91070556640625, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8628511428833008, "num_tokens": 435682787.0, "step": 11419 }, { "epoch": 1.4527413815036254, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.0490608215332, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8578258752822876, "num_tokens": 435719982.0, "step": 11420 }, { "epoch": 1.452868591782216, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 41.04328155517578, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.843544602394104, "num_tokens": 435755153.0, "step": 11421 }, { "epoch": 1.4529958020608065, "ewc_loss": 0.1318359375, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011110305786132812, "grad_norm": 40.394954681396484, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8693761825561523, "num_tokens": 435797513.0, "step": 11422 }, { "epoch": 1.453123012339397, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.64071273803711, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8588113784790039, "num_tokens": 435836835.0, "step": 11423 }, { "epoch": 1.4532502226179875, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.876529693603516, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8516904711723328, "num_tokens": 435875124.0, "step": 11424 }, { "epoch": 1.453377432896578, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.7225227355957, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8719052672386169, "num_tokens": 435914993.0, "step": 11425 }, { "epoch": 1.4535046431751686, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.98371887207031, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8693357706069946, "num_tokens": 435950916.0, "step": 11426 }, { "epoch": 1.4536318534537591, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.76779556274414, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8539570569992065, "num_tokens": 435991986.0, "step": 11427 }, { "epoch": 1.4537590637323496, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 41.041725158691406, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.86863112449646, "num_tokens": 436024603.0, "step": 11428 }, { "epoch": 1.45388627401094, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 40.32656478881836, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8550131320953369, "num_tokens": 436065540.0, "step": 11429 }, { "epoch": 1.4540134842895305, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.74089813232422, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8472064733505249, "num_tokens": 436104394.0, "step": 11430 }, { "epoch": 1.454140694568121, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.80055618286133, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8469278216362, "num_tokens": 436139977.0, "step": 11431 }, { "epoch": 1.4542679048467115, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011301040649414062, "grad_norm": 40.10564041137695, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8747793436050415, "num_tokens": 436178047.0, "step": 11432 }, { "epoch": 1.454395115125302, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.87925720214844, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.85837322473526, "num_tokens": 436218331.0, "step": 11433 }, { "epoch": 1.4545223254038926, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.83847427368164, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8586518168449402, "num_tokens": 436255663.0, "step": 11434 }, { "epoch": 1.4546495356824831, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 41.19512939453125, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.860526442527771, "num_tokens": 436301328.0, "step": 11435 }, { "epoch": 1.4547767459610736, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.186927795410156, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8684778213500977, "num_tokens": 436336487.0, "step": 11436 }, { "epoch": 1.4549039562396642, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 41.0869026184082, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8623977899551392, "num_tokens": 436377629.0, "step": 11437 }, { "epoch": 1.4550311665182547, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011205673217773438, "grad_norm": 39.86227798461914, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8755825161933899, "num_tokens": 436421590.0, "step": 11438 }, { "epoch": 1.4551583767968452, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 41.03413391113281, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8518553376197815, "num_tokens": 436461172.0, "step": 11439 }, { "epoch": 1.4552855870754358, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001125335693359375, "grad_norm": 40.047332763671875, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8683031797409058, "num_tokens": 436497536.0, "step": 11440 }, { "epoch": 1.4554127973540263, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 41.34834671020508, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.85820472240448, "num_tokens": 436543906.0, "step": 11441 }, { "epoch": 1.4555400076326168, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.1751594543457, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.86260986328125, "num_tokens": 436579462.0, "step": 11442 }, { "epoch": 1.4556672179112073, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.95283508300781, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.854109525680542, "num_tokens": 436618840.0, "step": 11443 }, { "epoch": 1.4557944281897979, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.26335525512695, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8490142822265625, "num_tokens": 436654176.0, "step": 11444 }, { "epoch": 1.4559216384683882, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.108642578125, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8742730617523193, "num_tokens": 436685507.0, "step": 11445 }, { "epoch": 1.4560488487469787, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.51153564453125, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8676789999008179, "num_tokens": 436729881.0, "step": 11446 }, { "epoch": 1.4561760590255692, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.681522369384766, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8579310178756714, "num_tokens": 436772355.0, "step": 11447 }, { "epoch": 1.4563032693041598, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.35304260253906, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8543238639831543, "num_tokens": 436808701.0, "step": 11448 }, { "epoch": 1.4564304795827503, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 41.37464141845703, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8700989484786987, "num_tokens": 436845441.0, "step": 11449 }, { "epoch": 1.4565576898613408, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.402713775634766, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8630487322807312, "num_tokens": 436885316.0, "step": 11450 }, { "epoch": 1.4566849001399313, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.77201461791992, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8538812398910522, "num_tokens": 436918315.0, "step": 11451 }, { "epoch": 1.4568121104185219, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.49430465698242, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8529103994369507, "num_tokens": 436956039.0, "step": 11452 }, { "epoch": 1.4569393206971124, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.44304656982422, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.845061719417572, "num_tokens": 436991274.0, "step": 11453 }, { "epoch": 1.4570665309757027, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.422332763671875, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8726127743721008, "num_tokens": 437029963.0, "step": 11454 }, { "epoch": 1.4571937412542932, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.64152908325195, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8531641960144043, "num_tokens": 437067030.0, "step": 11455 }, { "epoch": 1.4573209515328838, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.288368225097656, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8667147159576416, "num_tokens": 437103363.0, "step": 11456 }, { "epoch": 1.4574481618114743, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.79588317871094, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8658271431922913, "num_tokens": 437131574.0, "step": 11457 }, { "epoch": 1.4575753720900648, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.344356536865234, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8539680242538452, "num_tokens": 437169537.0, "step": 11458 }, { "epoch": 1.4577025823686554, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.46745300292969, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8657517433166504, "num_tokens": 437208025.0, "step": 11459 }, { "epoch": 1.4578297926472459, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.36922073364258, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8715183734893799, "num_tokens": 437246642.0, "step": 11460 }, { "epoch": 1.4579570029258364, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.5900993347168, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8562973737716675, "num_tokens": 437285274.0, "step": 11461 }, { "epoch": 1.458084213204427, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 39.89002227783203, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8769210577011108, "num_tokens": 437325865.0, "step": 11462 }, { "epoch": 1.4582114234830175, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.41425704956055, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8656603693962097, "num_tokens": 437364093.0, "step": 11463 }, { "epoch": 1.458338633761608, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.54931640625, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8477733135223389, "num_tokens": 437402413.0, "step": 11464 }, { "epoch": 1.4584658440401985, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.09807586669922, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8474032878875732, "num_tokens": 437442583.0, "step": 11465 }, { "epoch": 1.458593054318789, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.50807189941406, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8591171503067017, "num_tokens": 437485682.0, "step": 11466 }, { "epoch": 1.4587202645973796, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 40.27680969238281, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8501049280166626, "num_tokens": 437524859.0, "step": 11467 }, { "epoch": 1.45884747487597, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.59928894042969, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8524085283279419, "num_tokens": 437563590.0, "step": 11468 }, { "epoch": 1.4589746851545604, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.299312591552734, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8763665556907654, "num_tokens": 437601429.0, "step": 11469 }, { "epoch": 1.459101895433151, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.354976654052734, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8478929996490479, "num_tokens": 437639144.0, "step": 11470 }, { "epoch": 1.4592291057117415, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.37561798095703, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8685475587844849, "num_tokens": 437680846.0, "step": 11471 }, { "epoch": 1.459356315990332, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.68183135986328, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8618168830871582, "num_tokens": 437722156.0, "step": 11472 }, { "epoch": 1.4594835262689225, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.073822021484375, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8371955752372742, "num_tokens": 437766905.0, "step": 11473 }, { "epoch": 1.459610736547513, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.78566360473633, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8604195713996887, "num_tokens": 437801425.0, "step": 11474 }, { "epoch": 1.4597379468261036, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.70671081542969, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8499589562416077, "num_tokens": 437838126.0, "step": 11475 }, { "epoch": 1.4598651571046941, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.83211135864258, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8556837439537048, "num_tokens": 437876597.0, "step": 11476 }, { "epoch": 1.4599923673832846, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.84807586669922, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8723846673965454, "num_tokens": 437911852.0, "step": 11477 }, { "epoch": 1.460119577661875, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.56147766113281, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8603823184967041, "num_tokens": 437953418.0, "step": 11478 }, { "epoch": 1.4602467879404655, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.109554290771484, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8664933443069458, "num_tokens": 437989561.0, "step": 11479 }, { "epoch": 1.460373998219056, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.19272232055664, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8687976598739624, "num_tokens": 438033464.0, "step": 11480 }, { "epoch": 1.4605012084976465, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.33792495727539, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8648689985275269, "num_tokens": 438068676.0, "step": 11481 }, { "epoch": 1.460628418776237, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.11446762084961, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8418529033660889, "num_tokens": 438107867.0, "step": 11482 }, { "epoch": 1.4607556290548276, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.410362243652344, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8767603039741516, "num_tokens": 438144256.0, "step": 11483 }, { "epoch": 1.4608828393334181, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.80609130859375, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8695499300956726, "num_tokens": 438183458.0, "step": 11484 }, { "epoch": 1.4610100496120086, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.10011672973633, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8490803241729736, "num_tokens": 438216025.0, "step": 11485 }, { "epoch": 1.4611372598905992, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.84779357910156, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8717722296714783, "num_tokens": 438258185.0, "step": 11486 }, { "epoch": 1.4612644701691897, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.19719696044922, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8562558889389038, "num_tokens": 438298189.0, "step": 11487 }, { "epoch": 1.4613916804477802, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.719886779785156, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.850979745388031, "num_tokens": 438336593.0, "step": 11488 }, { "epoch": 1.4615188907263708, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.96663284301758, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8666195869445801, "num_tokens": 438374110.0, "step": 11489 }, { "epoch": 1.4616461010049613, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.70437240600586, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8434205651283264, "num_tokens": 438413062.0, "step": 11490 }, { "epoch": 1.4617733112835518, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.97001266479492, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8633260726928711, "num_tokens": 438452055.0, "step": 11491 }, { "epoch": 1.4619005215621423, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.0351676940918, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8632705211639404, "num_tokens": 438494224.0, "step": 11492 }, { "epoch": 1.4620277318407329, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 39.8780403137207, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8560848236083984, "num_tokens": 438529790.0, "step": 11493 }, { "epoch": 1.4621549421193232, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.24205017089844, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8655415177345276, "num_tokens": 438571371.0, "step": 11494 }, { "epoch": 1.4622821523979137, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 39.987754821777344, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8468809723854065, "num_tokens": 438612383.0, "step": 11495 }, { "epoch": 1.4624093626765042, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.26899719238281, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8511318564414978, "num_tokens": 438653443.0, "step": 11496 }, { "epoch": 1.4625365729550948, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 39.96561050415039, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8487558364868164, "num_tokens": 438689565.0, "step": 11497 }, { "epoch": 1.4626637832336853, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.11885070800781, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8608015775680542, "num_tokens": 438727753.0, "step": 11498 }, { "epoch": 1.4627909935122758, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.08975601196289, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8592977523803711, "num_tokens": 438764667.0, "step": 11499 }, { "epoch": 1.4629182037908663, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.89772415161133, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8672996759414673, "num_tokens": 438801739.0, "step": 11500 }, { "epoch": 1.4630454140694569, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.4394645690918, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.861823558807373, "num_tokens": 438840028.0, "step": 11501 }, { "epoch": 1.4631726243480474, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.45866012573242, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.858721137046814, "num_tokens": 438880632.0, "step": 11502 }, { "epoch": 1.4632998346266377, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.39238357543945, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8570200204849243, "num_tokens": 438923040.0, "step": 11503 }, { "epoch": 1.4634270449052282, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.13271713256836, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8612172603607178, "num_tokens": 438958597.0, "step": 11504 }, { "epoch": 1.4635542551838188, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.4436149597168, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8486205339431763, "num_tokens": 438997909.0, "step": 11505 }, { "epoch": 1.4636814654624093, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.20733642578125, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8677233457565308, "num_tokens": 439030084.0, "step": 11506 }, { "epoch": 1.4638086757409998, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.30620574951172, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8623205423355103, "num_tokens": 439067770.0, "step": 11507 }, { "epoch": 1.4639358860195903, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.36799621582031, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8632553219795227, "num_tokens": 439102476.0, "step": 11508 }, { "epoch": 1.4640630962981809, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.19065475463867, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8699970245361328, "num_tokens": 439138660.0, "step": 11509 }, { "epoch": 1.4641903065767714, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.25855255126953, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8684523701667786, "num_tokens": 439181690.0, "step": 11510 }, { "epoch": 1.464317516855362, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.28157424926758, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8649424910545349, "num_tokens": 439218825.0, "step": 11511 }, { "epoch": 1.4644447271339525, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.0326042175293, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8660328388214111, "num_tokens": 439253505.0, "step": 11512 }, { "epoch": 1.464571937412543, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.225379943847656, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8549456596374512, "num_tokens": 439292764.0, "step": 11513 }, { "epoch": 1.4646991476911335, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.40620040893555, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8697411417961121, "num_tokens": 439335269.0, "step": 11514 }, { "epoch": 1.464826357969724, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.241275787353516, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.862586498260498, "num_tokens": 439371977.0, "step": 11515 }, { "epoch": 1.4649535682483146, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.30748748779297, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8601992726325989, "num_tokens": 439412710.0, "step": 11516 }, { "epoch": 1.465080778526905, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.11786651611328, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8596376776695251, "num_tokens": 439449637.0, "step": 11517 }, { "epoch": 1.4652079888054954, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.52067184448242, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8553631901741028, "num_tokens": 439495143.0, "step": 11518 }, { "epoch": 1.465335199084086, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 39.8359489440918, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8484630584716797, "num_tokens": 439532721.0, "step": 11519 }, { "epoch": 1.4654624093626765, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.771636962890625, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8700825572013855, "num_tokens": 439567938.0, "step": 11520 }, { "epoch": 1.465589619641267, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.08279037475586, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8681786060333252, "num_tokens": 439595720.0, "step": 11521 }, { "epoch": 1.4657168299198575, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.47358322143555, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8609613180160522, "num_tokens": 439634091.0, "step": 11522 }, { "epoch": 1.465844040198448, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.39717102050781, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8491576313972473, "num_tokens": 439671978.0, "step": 11523 }, { "epoch": 1.4659712504770386, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.02664566040039, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8544081449508667, "num_tokens": 439707748.0, "step": 11524 }, { "epoch": 1.466098460755629, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.429931640625, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8676199316978455, "num_tokens": 439744060.0, "step": 11525 }, { "epoch": 1.4662256710342196, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.00287628173828, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8530609011650085, "num_tokens": 439789277.0, "step": 11526 }, { "epoch": 1.46635288131281, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.63890838623047, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8793531656265259, "num_tokens": 439827524.0, "step": 11527 }, { "epoch": 1.4664800915914005, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 39.661460876464844, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8563135862350464, "num_tokens": 439865181.0, "step": 11528 }, { "epoch": 1.466607301869991, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.64011001586914, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8530826568603516, "num_tokens": 439900071.0, "step": 11529 }, { "epoch": 1.4667345121485815, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 39.59783172607422, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8601081371307373, "num_tokens": 439940839.0, "step": 11530 }, { "epoch": 1.466861722427172, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.533565521240234, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8579437732696533, "num_tokens": 439978908.0, "step": 11531 }, { "epoch": 1.4669889327057626, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 39.60784912109375, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8598894476890564, "num_tokens": 440013448.0, "step": 11532 }, { "epoch": 1.467116142984353, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.23646545410156, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8647679686546326, "num_tokens": 440048377.0, "step": 11533 }, { "epoch": 1.4672433532629436, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 39.8876838684082, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.854032039642334, "num_tokens": 440089806.0, "step": 11534 }, { "epoch": 1.4673705635415342, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.18810272216797, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.865114152431488, "num_tokens": 440124209.0, "step": 11535 }, { "epoch": 1.4674977738201247, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 39.638763427734375, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8632186055183411, "num_tokens": 440158349.0, "step": 11536 }, { "epoch": 1.4676249840987152, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 39.94704818725586, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8606424331665039, "num_tokens": 440195220.0, "step": 11537 }, { "epoch": 1.4677521943773058, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 39.90185546875, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8604462146759033, "num_tokens": 440235158.0, "step": 11538 }, { "epoch": 1.4678794046558963, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.01434326171875, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8546673059463501, "num_tokens": 440271879.0, "step": 11539 }, { "epoch": 1.4680066149344868, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.08887481689453, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8586641550064087, "num_tokens": 440306454.0, "step": 11540 }, { "epoch": 1.4681338252130773, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.31834411621094, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8488049507141113, "num_tokens": 440341185.0, "step": 11541 }, { "epoch": 1.4682610354916679, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.01453399658203, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8588689565658569, "num_tokens": 440379168.0, "step": 11542 }, { "epoch": 1.4683882457702582, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.01420593261719, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8623979091644287, "num_tokens": 440417945.0, "step": 11543 }, { "epoch": 1.4685154560488487, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.500144958496094, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8440889716148376, "num_tokens": 440457864.0, "step": 11544 }, { "epoch": 1.4686426663274392, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 39.79066467285156, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8556513786315918, "num_tokens": 440491709.0, "step": 11545 }, { "epoch": 1.4687698766060298, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.259765625, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8654816150665283, "num_tokens": 440533570.0, "step": 11546 }, { "epoch": 1.4688970868846203, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.07020950317383, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8645967841148376, "num_tokens": 440568360.0, "step": 11547 }, { "epoch": 1.4690242971632108, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.006080627441406, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.871286153793335, "num_tokens": 440606848.0, "step": 11548 }, { "epoch": 1.4691515074418013, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.478599548339844, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8726804256439209, "num_tokens": 440638254.0, "step": 11549 }, { "epoch": 1.4692787177203919, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.054019927978516, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8665018677711487, "num_tokens": 440674122.0, "step": 11550 }, { "epoch": 1.4694059279989824, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.58188247680664, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8747673034667969, "num_tokens": 440718362.0, "step": 11551 }, { "epoch": 1.4695331382775727, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.31753921508789, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8680965304374695, "num_tokens": 440755971.0, "step": 11552 }, { "epoch": 1.4696603485561632, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.26677703857422, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8585017919540405, "num_tokens": 440794328.0, "step": 11553 }, { "epoch": 1.4697875588347538, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.1457633972168, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8481591939926147, "num_tokens": 440833383.0, "step": 11554 }, { "epoch": 1.4699147691133443, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.287208557128906, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8669151663780212, "num_tokens": 440875502.0, "step": 11555 }, { "epoch": 1.4700419793919348, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.351566314697266, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8645422458648682, "num_tokens": 440915259.0, "step": 11556 }, { "epoch": 1.4701691896705253, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.54714584350586, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8566491007804871, "num_tokens": 440951660.0, "step": 11557 }, { "epoch": 1.4702963999491159, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 39.6663818359375, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8608570694923401, "num_tokens": 440992533.0, "step": 11558 }, { "epoch": 1.4704236102277064, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.5215950012207, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8506633639335632, "num_tokens": 441027360.0, "step": 11559 }, { "epoch": 1.470550820506297, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 39.940860748291016, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8669648170471191, "num_tokens": 441071746.0, "step": 11560 }, { "epoch": 1.4706780307848875, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.4279670715332, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8601542115211487, "num_tokens": 441105378.0, "step": 11561 }, { "epoch": 1.470805241063478, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.41926193237305, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8608889579772949, "num_tokens": 441150000.0, "step": 11562 }, { "epoch": 1.4709324513420685, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.09083938598633, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.853793203830719, "num_tokens": 441188426.0, "step": 11563 }, { "epoch": 1.471059661620659, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.80885696411133, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8677617907524109, "num_tokens": 441226028.0, "step": 11564 }, { "epoch": 1.4711868718992496, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.470733642578125, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8619933724403381, "num_tokens": 441265788.0, "step": 11565 }, { "epoch": 1.47131408217784, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.36147689819336, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8674578070640564, "num_tokens": 441297280.0, "step": 11566 }, { "epoch": 1.4714412924564304, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.320106506347656, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8608088493347168, "num_tokens": 441331229.0, "step": 11567 }, { "epoch": 1.471568502735021, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.400272369384766, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8640331029891968, "num_tokens": 441371064.0, "step": 11568 }, { "epoch": 1.4716957130136115, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.33119583129883, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8419638872146606, "num_tokens": 441405706.0, "step": 11569 }, { "epoch": 1.471822923292202, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 39.957332611083984, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8506792783737183, "num_tokens": 441445966.0, "step": 11570 }, { "epoch": 1.4719501335707925, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.44258117675781, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8450692892074585, "num_tokens": 441478916.0, "step": 11571 }, { "epoch": 1.472077343849383, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 39.798439025878906, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8563498258590698, "num_tokens": 441519245.0, "step": 11572 }, { "epoch": 1.4722045541279736, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.81171798706055, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8523897528648376, "num_tokens": 441557485.0, "step": 11573 }, { "epoch": 1.472331764406564, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.03578186035156, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8707234263420105, "num_tokens": 441596042.0, "step": 11574 }, { "epoch": 1.4724589746851546, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.50449752807617, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8703458309173584, "num_tokens": 441634391.0, "step": 11575 }, { "epoch": 1.472586184963745, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.58188247680664, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8519668579101562, "num_tokens": 441672201.0, "step": 11576 }, { "epoch": 1.4727133952423355, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.25440216064453, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8587497472763062, "num_tokens": 441713350.0, "step": 11577 }, { "epoch": 1.472840605520926, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.64744567871094, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8589996695518494, "num_tokens": 441754677.0, "step": 11578 }, { "epoch": 1.4729678157995165, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.56795883178711, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8490150570869446, "num_tokens": 441797037.0, "step": 11579 }, { "epoch": 1.473095026078107, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.3529052734375, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8572756052017212, "num_tokens": 441831661.0, "step": 11580 }, { "epoch": 1.4732222363566976, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.433345794677734, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8632819652557373, "num_tokens": 441876933.0, "step": 11581 }, { "epoch": 1.473349446635288, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.22221374511719, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8803311586380005, "num_tokens": 441918015.0, "step": 11582 }, { "epoch": 1.4734766569138786, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.23183059692383, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8528296947479248, "num_tokens": 441956573.0, "step": 11583 }, { "epoch": 1.4736038671924692, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.598854064941406, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8748635053634644, "num_tokens": 442002064.0, "step": 11584 }, { "epoch": 1.4737310774710597, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.45268249511719, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8670635223388672, "num_tokens": 442040122.0, "step": 11585 }, { "epoch": 1.4738582877496502, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.82122802734375, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8626044988632202, "num_tokens": 442079698.0, "step": 11586 }, { "epoch": 1.4739854980282407, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.14360809326172, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8742309212684631, "num_tokens": 442118252.0, "step": 11587 }, { "epoch": 1.4741127083068313, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.87007522583008, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8640726208686829, "num_tokens": 442154675.0, "step": 11588 }, { "epoch": 1.4742399185854218, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.410865783691406, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8665844798088074, "num_tokens": 442189449.0, "step": 11589 }, { "epoch": 1.4743671288640123, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.87874221801758, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8639892935752869, "num_tokens": 442222844.0, "step": 11590 }, { "epoch": 1.4744943391426026, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.59336471557617, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.866041362285614, "num_tokens": 442258599.0, "step": 11591 }, { "epoch": 1.4746215494211932, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.77240753173828, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8696441650390625, "num_tokens": 442294775.0, "step": 11592 }, { "epoch": 1.4747487596997837, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.834716796875, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8723006844520569, "num_tokens": 442330313.0, "step": 11593 }, { "epoch": 1.4748759699783742, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.96137619018555, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8601548671722412, "num_tokens": 442364988.0, "step": 11594 }, { "epoch": 1.4750031802569648, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.427669525146484, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.862133264541626, "num_tokens": 442406927.0, "step": 11595 }, { "epoch": 1.4751303905355553, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.53081130981445, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8644713163375854, "num_tokens": 442442092.0, "step": 11596 }, { "epoch": 1.4752576008141458, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.089027404785156, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8679277896881104, "num_tokens": 442483013.0, "step": 11597 }, { "epoch": 1.4753848110927363, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.21849060058594, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8655064105987549, "num_tokens": 442525052.0, "step": 11598 }, { "epoch": 1.4755120213713269, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.75514221191406, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8514243364334106, "num_tokens": 442562543.0, "step": 11599 }, { "epoch": 1.4756392316499174, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.77220153808594, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8657670021057129, "num_tokens": 442604976.0, "step": 11600 }, { "epoch": 1.4757664419285077, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.60468292236328, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8733211755752563, "num_tokens": 442642271.0, "step": 11601 }, { "epoch": 1.4758936522070982, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.83309555053711, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8559200167655945, "num_tokens": 442680508.0, "step": 11602 }, { "epoch": 1.4760208624856888, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.79351806640625, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8494262099266052, "num_tokens": 442718391.0, "step": 11603 }, { "epoch": 1.4761480727642793, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.72627639770508, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8713047504425049, "num_tokens": 442757688.0, "step": 11604 }, { "epoch": 1.4762752830428698, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.82029342651367, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8742659091949463, "num_tokens": 442796704.0, "step": 11605 }, { "epoch": 1.4764024933214603, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.484901428222656, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8715306520462036, "num_tokens": 442838791.0, "step": 11606 }, { "epoch": 1.4765297036000509, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.70213317871094, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8617806434631348, "num_tokens": 442882368.0, "step": 11607 }, { "epoch": 1.4766569138786414, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.811519622802734, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8625247478485107, "num_tokens": 442923768.0, "step": 11608 }, { "epoch": 1.476784124157232, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.63303756713867, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.875937819480896, "num_tokens": 442955656.0, "step": 11609 }, { "epoch": 1.4769113344358225, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.14895248413086, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8526129722595215, "num_tokens": 442997323.0, "step": 11610 }, { "epoch": 1.477038544714413, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.614158630371094, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8467631936073303, "num_tokens": 443033454.0, "step": 11611 }, { "epoch": 1.4771657549930035, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.12819290161133, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8604382276535034, "num_tokens": 443077902.0, "step": 11612 }, { "epoch": 1.477292965271594, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.37335968017578, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8524271249771118, "num_tokens": 443116515.0, "step": 11613 }, { "epoch": 1.4774201755501846, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.96860885620117, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8870620727539062, "num_tokens": 443154124.0, "step": 11614 }, { "epoch": 1.477547385828775, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 41.024131774902344, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.869544506072998, "num_tokens": 443195852.0, "step": 11615 }, { "epoch": 1.4776745961073654, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.87008285522461, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8532383441925049, "num_tokens": 443229850.0, "step": 11616 }, { "epoch": 1.477801806385956, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.930301666259766, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8531800508499146, "num_tokens": 443274481.0, "step": 11617 }, { "epoch": 1.4779290166645465, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.40352249145508, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8684017658233643, "num_tokens": 443311737.0, "step": 11618 }, { "epoch": 1.478056226943137, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.952518463134766, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8536439538002014, "num_tokens": 443345310.0, "step": 11619 }, { "epoch": 1.4781834372217275, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.28423309326172, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8606160879135132, "num_tokens": 443392869.0, "step": 11620 }, { "epoch": 1.478310647500318, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.489288330078125, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8603876233100891, "num_tokens": 443432842.0, "step": 11621 }, { "epoch": 1.4784378577789086, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.439144134521484, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.863539457321167, "num_tokens": 443465954.0, "step": 11622 }, { "epoch": 1.478565068057499, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.80486297607422, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8507561683654785, "num_tokens": 443508506.0, "step": 11623 }, { "epoch": 1.4786922783360896, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.948455810546875, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8593415021896362, "num_tokens": 443543575.0, "step": 11624 }, { "epoch": 1.47881948861468, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.35657501220703, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8651061058044434, "num_tokens": 443578504.0, "step": 11625 }, { "epoch": 1.4789466988932705, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.35610580444336, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8740426301956177, "num_tokens": 443614647.0, "step": 11626 }, { "epoch": 1.479073909171861, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.28834915161133, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8569433689117432, "num_tokens": 443649244.0, "step": 11627 }, { "epoch": 1.4792011194504515, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.041038513183594, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8562605381011963, "num_tokens": 443688374.0, "step": 11628 }, { "epoch": 1.479328329729042, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.42146682739258, "learning_rate": 1e-06, "loss": 0.6461, "mean_token_accuracy": 0.8301644325256348, "num_tokens": 443730929.0, "step": 11629 }, { "epoch": 1.4794555400076326, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.015872955322266, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.85389244556427, "num_tokens": 443763067.0, "step": 11630 }, { "epoch": 1.479582750286223, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.909671783447266, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8490999937057495, "num_tokens": 443801273.0, "step": 11631 }, { "epoch": 1.4797099605648136, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.74398422241211, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8492122888565063, "num_tokens": 443840394.0, "step": 11632 }, { "epoch": 1.4798371708434042, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.033756256103516, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8564273118972778, "num_tokens": 443877486.0, "step": 11633 }, { "epoch": 1.4799643811219947, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.58110046386719, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8600270748138428, "num_tokens": 443913763.0, "step": 11634 }, { "epoch": 1.4800915914005852, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.93832778930664, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8686810731887817, "num_tokens": 443946840.0, "step": 11635 }, { "epoch": 1.4802188016791757, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.72267532348633, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8458375334739685, "num_tokens": 443992439.0, "step": 11636 }, { "epoch": 1.4803460119577663, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.054996490478516, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.869746208190918, "num_tokens": 444029348.0, "step": 11637 }, { "epoch": 1.4804732222363568, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.89636993408203, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8704671859741211, "num_tokens": 444068386.0, "step": 11638 }, { "epoch": 1.4806004325149473, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.78013610839844, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8497509956359863, "num_tokens": 444110307.0, "step": 11639 }, { "epoch": 1.4807276427935376, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.357059478759766, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.859533965587616, "num_tokens": 444153919.0, "step": 11640 }, { "epoch": 1.4808548530721282, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.45061492919922, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8637313842773438, "num_tokens": 444190575.0, "step": 11641 }, { "epoch": 1.4809820633507187, "ewc_loss": 0.140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 55.06066131591797, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8553240299224854, "num_tokens": 444226108.0, "step": 11642 }, { "epoch": 1.4811092736293092, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.95314407348633, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8585510849952698, "num_tokens": 444268162.0, "step": 11643 }, { "epoch": 1.4812364839078997, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.295833587646484, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8482257127761841, "num_tokens": 444309273.0, "step": 11644 }, { "epoch": 1.4813636941864903, "ewc_loss": 0.1328125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011157989501953125, "grad_norm": 40.6932373046875, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8639315962791443, "num_tokens": 444338697.0, "step": 11645 }, { "epoch": 1.4814909044650808, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.11610412597656, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8703818321228027, "num_tokens": 444377954.0, "step": 11646 }, { "epoch": 1.4816181147436713, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 41.20439147949219, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8613197207450867, "num_tokens": 444416996.0, "step": 11647 }, { "epoch": 1.4817453250222619, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 42.44064712524414, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8665874004364014, "num_tokens": 444455550.0, "step": 11648 }, { "epoch": 1.4818725353008524, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.12382507324219, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8560743927955627, "num_tokens": 444493979.0, "step": 11649 }, { "epoch": 1.4819997455794427, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 42.09923553466797, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8595532774925232, "num_tokens": 444540622.0, "step": 11650 }, { "epoch": 1.4821269558580332, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.1994514465332, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8570212125778198, "num_tokens": 444577772.0, "step": 11651 }, { "epoch": 1.4822541661366238, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.57680892944336, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8478808403015137, "num_tokens": 444615523.0, "step": 11652 }, { "epoch": 1.4823813764152143, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.610939025878906, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8603333234786987, "num_tokens": 444659785.0, "step": 11653 }, { "epoch": 1.4825085866938048, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.20699691772461, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8560410737991333, "num_tokens": 444700806.0, "step": 11654 }, { "epoch": 1.4826357969723953, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.30086898803711, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8617851138114929, "num_tokens": 444734320.0, "step": 11655 }, { "epoch": 1.4827630072509859, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.404212951660156, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.854685366153717, "num_tokens": 444770215.0, "step": 11656 }, { "epoch": 1.4828902175295764, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.04356002807617, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8603769540786743, "num_tokens": 444805432.0, "step": 11657 }, { "epoch": 1.483017427808167, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.52495193481445, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.859919011592865, "num_tokens": 444832398.0, "step": 11658 }, { "epoch": 1.4831446380867574, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 41.60443878173828, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8661810159683228, "num_tokens": 444876128.0, "step": 11659 }, { "epoch": 1.483271848365348, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.497684478759766, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8545759916305542, "num_tokens": 444915188.0, "step": 11660 }, { "epoch": 1.4833990586439385, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.45783615112305, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8658278584480286, "num_tokens": 444953528.0, "step": 11661 }, { "epoch": 1.483526268922529, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 41.37638473510742, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8548158407211304, "num_tokens": 444996469.0, "step": 11662 }, { "epoch": 1.4836534792011196, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 41.19548416137695, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8765625953674316, "num_tokens": 445034298.0, "step": 11663 }, { "epoch": 1.48378068947971, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 41.13155746459961, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8595162630081177, "num_tokens": 445079010.0, "step": 11664 }, { "epoch": 1.4839078997583004, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.59543991088867, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8749299049377441, "num_tokens": 445116009.0, "step": 11665 }, { "epoch": 1.484035110036891, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011491775512695312, "grad_norm": 41.18254089355469, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8740904331207275, "num_tokens": 445156313.0, "step": 11666 }, { "epoch": 1.4841623203154815, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.92082595825195, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.860541582107544, "num_tokens": 445187424.0, "step": 11667 }, { "epoch": 1.484289530594072, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.800018310546875, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8603194355964661, "num_tokens": 445233448.0, "step": 11668 }, { "epoch": 1.4844167408726625, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.474300384521484, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.870757520198822, "num_tokens": 445273663.0, "step": 11669 }, { "epoch": 1.484543951151253, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.49091339111328, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8653389811515808, "num_tokens": 445313790.0, "step": 11670 }, { "epoch": 1.4846711614298436, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.77228546142578, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8713709115982056, "num_tokens": 445351818.0, "step": 11671 }, { "epoch": 1.484798371708434, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.99502182006836, "learning_rate": 1e-06, "loss": 0.6363, "mean_token_accuracy": 0.8353320360183716, "num_tokens": 445391763.0, "step": 11672 }, { "epoch": 1.4849255819870246, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 41.14191818237305, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8547973036766052, "num_tokens": 445426723.0, "step": 11673 }, { "epoch": 1.485052792265615, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.704105377197266, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8654601573944092, "num_tokens": 445465635.0, "step": 11674 }, { "epoch": 1.4851800025442055, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.8974494934082, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8607666492462158, "num_tokens": 445502819.0, "step": 11675 }, { "epoch": 1.485307212822796, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 42.114593505859375, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8703405857086182, "num_tokens": 445541861.0, "step": 11676 }, { "epoch": 1.4854344231013865, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 41.97404479980469, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8750297427177429, "num_tokens": 445578058.0, "step": 11677 }, { "epoch": 1.485561633379977, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.51515197753906, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8471658229827881, "num_tokens": 445612896.0, "step": 11678 }, { "epoch": 1.4856888436585676, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 41.63383865356445, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8539620637893677, "num_tokens": 445647849.0, "step": 11679 }, { "epoch": 1.485816053937158, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.991825103759766, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8452340364456177, "num_tokens": 445689199.0, "step": 11680 }, { "epoch": 1.4859432642157486, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 42.4854621887207, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8581956624984741, "num_tokens": 445727869.0, "step": 11681 }, { "epoch": 1.4860704744943392, "ewc_loss": 0.1337890625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011348724365234375, "grad_norm": 40.561981201171875, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8643187284469604, "num_tokens": 445770105.0, "step": 11682 }, { "epoch": 1.4861976847729297, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.53330612182617, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.840091347694397, "num_tokens": 445812950.0, "step": 11683 }, { "epoch": 1.4863248950515202, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 40.976070404052734, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.857914924621582, "num_tokens": 445850811.0, "step": 11684 }, { "epoch": 1.4864521053301107, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.98569107055664, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8637530207633972, "num_tokens": 445891967.0, "step": 11685 }, { "epoch": 1.4865793156087013, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.01307678222656, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8617919683456421, "num_tokens": 445927936.0, "step": 11686 }, { "epoch": 1.4867065258872918, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.660831451416016, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8739094734191895, "num_tokens": 445962481.0, "step": 11687 }, { "epoch": 1.4868337361658823, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.760921478271484, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8742846250534058, "num_tokens": 446001445.0, "step": 11688 }, { "epoch": 1.4869609464444726, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.644596099853516, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8811682462692261, "num_tokens": 446037904.0, "step": 11689 }, { "epoch": 1.4870881567230632, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.3714485168457, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8691154718399048, "num_tokens": 446077583.0, "step": 11690 }, { "epoch": 1.4872153670016537, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.5936279296875, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8591929078102112, "num_tokens": 446110123.0, "step": 11691 }, { "epoch": 1.4873425772802442, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.44702911376953, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.858374834060669, "num_tokens": 446149832.0, "step": 11692 }, { "epoch": 1.4874697875588347, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.63826370239258, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8657066822052002, "num_tokens": 446184211.0, "step": 11693 }, { "epoch": 1.4875969978374253, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.33936309814453, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8552317023277283, "num_tokens": 446229620.0, "step": 11694 }, { "epoch": 1.4877242081160158, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.72350311279297, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8629244565963745, "num_tokens": 446267134.0, "step": 11695 }, { "epoch": 1.4878514183946063, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.00849533081055, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8466366529464722, "num_tokens": 446303001.0, "step": 11696 }, { "epoch": 1.4879786286731969, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.837318420410156, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.859941840171814, "num_tokens": 446334286.0, "step": 11697 }, { "epoch": 1.4881058389517874, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.849159240722656, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8795302510261536, "num_tokens": 446369547.0, "step": 11698 }, { "epoch": 1.4882330492303777, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.962039947509766, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.844789445400238, "num_tokens": 446408179.0, "step": 11699 }, { "epoch": 1.4883602595089682, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.991661071777344, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8626701831817627, "num_tokens": 446448533.0, "step": 11700 }, { "epoch": 1.4884874697875587, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.345062255859375, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8576520085334778, "num_tokens": 446489445.0, "step": 11701 }, { "epoch": 1.4886146800661493, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.750877380371094, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.874252438545227, "num_tokens": 446526530.0, "step": 11702 }, { "epoch": 1.4887418903447398, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.79220962524414, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8616147041320801, "num_tokens": 446558315.0, "step": 11703 }, { "epoch": 1.4888691006233303, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.10985565185547, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8649179339408875, "num_tokens": 446598548.0, "step": 11704 }, { "epoch": 1.4889963109019209, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 42.46913528442383, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8558914661407471, "num_tokens": 446632111.0, "step": 11705 }, { "epoch": 1.4891235211805114, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011396408081054688, "grad_norm": 40.23294448852539, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8699970245361328, "num_tokens": 446669296.0, "step": 11706 }, { "epoch": 1.489250731459102, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.1716194152832, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8637784719467163, "num_tokens": 446707891.0, "step": 11707 }, { "epoch": 1.4893779417376924, "ewc_loss": 0.134765625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011444091796875, "grad_norm": 41.085174560546875, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8488552570343018, "num_tokens": 446749455.0, "step": 11708 }, { "epoch": 1.489505152016283, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.42294692993164, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8736582398414612, "num_tokens": 446787001.0, "step": 11709 }, { "epoch": 1.4896323622948735, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.013343811035156, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8593865633010864, "num_tokens": 446827262.0, "step": 11710 }, { "epoch": 1.489759572573464, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.86827087402344, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8753908276557922, "num_tokens": 446860514.0, "step": 11711 }, { "epoch": 1.4898867828520546, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.18672561645508, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8515884876251221, "num_tokens": 446902605.0, "step": 11712 }, { "epoch": 1.490013993130645, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.1396598815918, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8676400184631348, "num_tokens": 446938638.0, "step": 11713 }, { "epoch": 1.4901412034092354, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.92299270629883, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8663400411605835, "num_tokens": 446979286.0, "step": 11714 }, { "epoch": 1.490268413687826, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.431243896484375, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8452552556991577, "num_tokens": 447017405.0, "step": 11715 }, { "epoch": 1.4903956239664164, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.786529541015625, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8642244338989258, "num_tokens": 447056733.0, "step": 11716 }, { "epoch": 1.490522834245007, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.085182189941406, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8552283048629761, "num_tokens": 447094493.0, "step": 11717 }, { "epoch": 1.4906500445235975, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 41.17812728881836, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8768537640571594, "num_tokens": 447134039.0, "step": 11718 }, { "epoch": 1.490777254802188, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.30228805541992, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8698737025260925, "num_tokens": 447168320.0, "step": 11719 }, { "epoch": 1.4909044650807786, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.66777801513672, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8450238108634949, "num_tokens": 447205640.0, "step": 11720 }, { "epoch": 1.491031675359369, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.329864501953125, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8739688396453857, "num_tokens": 447239831.0, "step": 11721 }, { "epoch": 1.4911588856379596, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.64612579345703, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8460763692855835, "num_tokens": 447281100.0, "step": 11722 }, { "epoch": 1.49128609591655, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.956695556640625, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8420334458351135, "num_tokens": 447316137.0, "step": 11723 }, { "epoch": 1.4914133061951405, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.592464447021484, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8545807003974915, "num_tokens": 447357922.0, "step": 11724 }, { "epoch": 1.491540516473731, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.933311462402344, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8571338057518005, "num_tokens": 447394905.0, "step": 11725 }, { "epoch": 1.4916677267523215, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.7208251953125, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8721818923950195, "num_tokens": 447435379.0, "step": 11726 }, { "epoch": 1.491794937030912, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.14125442504883, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8514463901519775, "num_tokens": 447473058.0, "step": 11727 }, { "epoch": 1.4919221473095026, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.7456169128418, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8713427782058716, "num_tokens": 447510966.0, "step": 11728 }, { "epoch": 1.492049357588093, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.1883544921875, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.864986777305603, "num_tokens": 447554747.0, "step": 11729 }, { "epoch": 1.4921765678666836, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.8701171875, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8640103340148926, "num_tokens": 447594426.0, "step": 11730 }, { "epoch": 1.4923037781452742, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.9769287109375, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8542534708976746, "num_tokens": 447634786.0, "step": 11731 }, { "epoch": 1.4924309884238647, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.05527114868164, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.857785701751709, "num_tokens": 447668682.0, "step": 11732 }, { "epoch": 1.4925581987024552, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.768333435058594, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8500581383705139, "num_tokens": 447705566.0, "step": 11733 }, { "epoch": 1.4926854089810457, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.32707214355469, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8572120666503906, "num_tokens": 447746385.0, "step": 11734 }, { "epoch": 1.4928126192596363, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.37284851074219, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8640100955963135, "num_tokens": 447790483.0, "step": 11735 }, { "epoch": 1.4929398295382268, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.631778717041016, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.85831218957901, "num_tokens": 447826872.0, "step": 11736 }, { "epoch": 1.4930670398168173, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.14056396484375, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8671208024024963, "num_tokens": 447864071.0, "step": 11737 }, { "epoch": 1.4931942500954076, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.596317291259766, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8440804481506348, "num_tokens": 447903320.0, "step": 11738 }, { "epoch": 1.4933214603739982, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.28656768798828, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8745720386505127, "num_tokens": 447940253.0, "step": 11739 }, { "epoch": 1.4934486706525887, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.32210922241211, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8659581542015076, "num_tokens": 447978277.0, "step": 11740 }, { "epoch": 1.4935758809311792, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.76454162597656, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8502668142318726, "num_tokens": 448018798.0, "step": 11741 }, { "epoch": 1.4937030912097697, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.37031936645508, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8497360348701477, "num_tokens": 448057469.0, "step": 11742 }, { "epoch": 1.4938303014883603, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.97533416748047, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8473336696624756, "num_tokens": 448089032.0, "step": 11743 }, { "epoch": 1.4939575117669508, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.269859313964844, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8557471036911011, "num_tokens": 448122438.0, "step": 11744 }, { "epoch": 1.4940847220455413, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.63707733154297, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8603271245956421, "num_tokens": 448166844.0, "step": 11745 }, { "epoch": 1.4942119323241319, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.16130065917969, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8512369990348816, "num_tokens": 448197766.0, "step": 11746 }, { "epoch": 1.4943391426027224, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.75833511352539, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8471805453300476, "num_tokens": 448240850.0, "step": 11747 }, { "epoch": 1.4944663528813127, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.272789001464844, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8703519701957703, "num_tokens": 448274187.0, "step": 11748 }, { "epoch": 1.4945935631599032, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.925662994384766, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8667665123939514, "num_tokens": 448311069.0, "step": 11749 }, { "epoch": 1.4947207734384937, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.27388000488281, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.859930157661438, "num_tokens": 448351536.0, "step": 11750 }, { "epoch": 1.4948479837170843, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.02598190307617, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.858662486076355, "num_tokens": 448385108.0, "step": 11751 }, { "epoch": 1.4949751939956748, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.009944915771484, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8710433840751648, "num_tokens": 448423272.0, "step": 11752 }, { "epoch": 1.4951024042742653, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.55781173706055, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8657172918319702, "num_tokens": 448462904.0, "step": 11753 }, { "epoch": 1.4952296145528559, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.18263626098633, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8579204082489014, "num_tokens": 448503257.0, "step": 11754 }, { "epoch": 1.4953568248314464, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.85893249511719, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8658863306045532, "num_tokens": 448535299.0, "step": 11755 }, { "epoch": 1.495484035110037, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.4677734375, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8684642314910889, "num_tokens": 448568878.0, "step": 11756 }, { "epoch": 1.4956112453886274, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.4790153503418, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8785867691040039, "num_tokens": 448604959.0, "step": 11757 }, { "epoch": 1.495738455667218, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.63736343383789, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.872426450252533, "num_tokens": 448638487.0, "step": 11758 }, { "epoch": 1.4958656659458085, "ewc_loss": 0.1357421875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011539459228515625, "grad_norm": 40.513858795166016, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8666925430297852, "num_tokens": 448677212.0, "step": 11759 }, { "epoch": 1.495992876224399, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.52705764770508, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8546622395515442, "num_tokens": 448719654.0, "step": 11760 }, { "epoch": 1.4961200865029896, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 40.42626190185547, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8655011057853699, "num_tokens": 448760783.0, "step": 11761 }, { "epoch": 1.49624729678158, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.22517013549805, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8501462340354919, "num_tokens": 448802843.0, "step": 11762 }, { "epoch": 1.4963745070601704, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.537864685058594, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8580848574638367, "num_tokens": 448838430.0, "step": 11763 }, { "epoch": 1.496501717338761, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.62181854248047, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.852660059928894, "num_tokens": 448880450.0, "step": 11764 }, { "epoch": 1.4966289276173514, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.60025405883789, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8622880578041077, "num_tokens": 448919352.0, "step": 11765 }, { "epoch": 1.496756137895942, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.34023666381836, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8370290994644165, "num_tokens": 448965402.0, "step": 11766 }, { "epoch": 1.4968833481745325, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.69416427612305, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8721252679824829, "num_tokens": 448996983.0, "step": 11767 }, { "epoch": 1.497010558453123, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.33403396606445, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8463010191917419, "num_tokens": 449036555.0, "step": 11768 }, { "epoch": 1.4971377687317136, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.542724609375, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8727455735206604, "num_tokens": 449070545.0, "step": 11769 }, { "epoch": 1.497264979010304, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.75696563720703, "learning_rate": 1e-06, "loss": 0.6216, "mean_token_accuracy": 0.8460808992385864, "num_tokens": 449108859.0, "step": 11770 }, { "epoch": 1.4973921892888946, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011587142944335938, "grad_norm": 41.03815460205078, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.858712911605835, "num_tokens": 449145333.0, "step": 11771 }, { "epoch": 1.497519399567485, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.207618713378906, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8544502258300781, "num_tokens": 449181292.0, "step": 11772 }, { "epoch": 1.4976466098460754, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.97854995727539, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8626673221588135, "num_tokens": 449213940.0, "step": 11773 }, { "epoch": 1.497773820124666, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.99287414550781, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8720433712005615, "num_tokens": 449252487.0, "step": 11774 }, { "epoch": 1.4979010304032565, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.897186279296875, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8521013259887695, "num_tokens": 449287268.0, "step": 11775 }, { "epoch": 1.498028240681847, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.13859176635742, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8568136692047119, "num_tokens": 449326975.0, "step": 11776 }, { "epoch": 1.4981554509604376, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 41.20219039916992, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8603286147117615, "num_tokens": 449366061.0, "step": 11777 }, { "epoch": 1.498282661239028, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.22132873535156, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8694455623626709, "num_tokens": 449401763.0, "step": 11778 }, { "epoch": 1.4984098715176186, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.41967010498047, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8806650638580322, "num_tokens": 449436605.0, "step": 11779 }, { "epoch": 1.4985370817962091, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.48533248901367, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8563582897186279, "num_tokens": 449474523.0, "step": 11780 }, { "epoch": 1.4986642920747997, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.7205696105957, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8716259002685547, "num_tokens": 449513301.0, "step": 11781 }, { "epoch": 1.4987915023533902, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.10728454589844, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8581639528274536, "num_tokens": 449547826.0, "step": 11782 }, { "epoch": 1.4989187126319807, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.75969314575195, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8612317442893982, "num_tokens": 449584818.0, "step": 11783 }, { "epoch": 1.4990459229105713, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.30818176269531, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8420907258987427, "num_tokens": 449625368.0, "step": 11784 }, { "epoch": 1.4991731331891618, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.51613998413086, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8479383587837219, "num_tokens": 449666320.0, "step": 11785 }, { "epoch": 1.4993003434677523, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.375553131103516, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8531437516212463, "num_tokens": 449705993.0, "step": 11786 }, { "epoch": 1.4994275537463426, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.538917541503906, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8547576665878296, "num_tokens": 449747088.0, "step": 11787 }, { "epoch": 1.4995547640249332, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.008541107177734, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8629235625267029, "num_tokens": 449789840.0, "step": 11788 }, { "epoch": 1.4996819743035237, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.23884582519531, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.860588550567627, "num_tokens": 449830599.0, "step": 11789 }, { "epoch": 1.4998091845821142, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.61736297607422, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8646315336227417, "num_tokens": 449864476.0, "step": 11790 }, { "epoch": 1.4999363948607047, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.28791046142578, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8431749939918518, "num_tokens": 449903791.0, "step": 11791 }, { "epoch": 1.5000636051392953, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.576454162597656, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8603675961494446, "num_tokens": 449945478.0, "step": 11792 }, { "epoch": 1.5001908154178858, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.43583679199219, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8582420349121094, "num_tokens": 449986175.0, "step": 11793 }, { "epoch": 1.5003180256964763, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.53721618652344, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8576210141181946, "num_tokens": 450025537.0, "step": 11794 }, { "epoch": 1.5004452359750666, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.44657516479492, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8467291593551636, "num_tokens": 450068269.0, "step": 11795 }, { "epoch": 1.5005724462536572, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.9836311340332, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8762621879577637, "num_tokens": 450106234.0, "step": 11796 }, { "epoch": 1.5006996565322477, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.082393646240234, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.870739221572876, "num_tokens": 450141916.0, "step": 11797 }, { "epoch": 1.5008268668108382, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.15449905395508, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8460255265235901, "num_tokens": 450175427.0, "step": 11798 }, { "epoch": 1.5009540770894287, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.40576171875, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8627588748931885, "num_tokens": 450211104.0, "step": 11799 }, { "epoch": 1.5010812873680193, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.63200378417969, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8535329103469849, "num_tokens": 450244844.0, "step": 11800 }, { "epoch": 1.5012084976466098, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.38016128540039, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8591282367706299, "num_tokens": 450286094.0, "step": 11801 }, { "epoch": 1.5013357079252003, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.6202278137207, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8515542149543762, "num_tokens": 450324938.0, "step": 11802 }, { "epoch": 1.5014629182037909, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.400596618652344, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8593429923057556, "num_tokens": 450368572.0, "step": 11803 }, { "epoch": 1.5015901284823814, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.64543533325195, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8380755186080933, "num_tokens": 450410926.0, "step": 11804 }, { "epoch": 1.501717338760972, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.209285736083984, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8631985187530518, "num_tokens": 450450206.0, "step": 11805 }, { "epoch": 1.5018445490395624, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.70484161376953, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8398102521896362, "num_tokens": 450488998.0, "step": 11806 }, { "epoch": 1.501971759318153, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.91802215576172, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8710140585899353, "num_tokens": 450530476.0, "step": 11807 }, { "epoch": 1.5020989695967435, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.31438064575195, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8664915561676025, "num_tokens": 450560655.0, "step": 11808 }, { "epoch": 1.502226179875334, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.03855514526367, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8619963526725769, "num_tokens": 450601816.0, "step": 11809 }, { "epoch": 1.5023533901539246, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.14570617675781, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8621243834495544, "num_tokens": 450634182.0, "step": 11810 }, { "epoch": 1.502480600432515, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.08891677856445, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8553313612937927, "num_tokens": 450672705.0, "step": 11811 }, { "epoch": 1.5026078107111056, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.12786102294922, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8484479188919067, "num_tokens": 450705636.0, "step": 11812 }, { "epoch": 1.502735020989696, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.85231018066406, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8518004417419434, "num_tokens": 450736165.0, "step": 11813 }, { "epoch": 1.5028622312682864, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.015869140625, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8609510660171509, "num_tokens": 450776939.0, "step": 11814 }, { "epoch": 1.502989441546877, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.09425735473633, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8656308650970459, "num_tokens": 450811994.0, "step": 11815 }, { "epoch": 1.5031166518254675, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.81376647949219, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8602792024612427, "num_tokens": 450853688.0, "step": 11816 }, { "epoch": 1.503243862104058, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.32173538208008, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8601179122924805, "num_tokens": 450892100.0, "step": 11817 }, { "epoch": 1.5033710723826486, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.488162994384766, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.877332329750061, "num_tokens": 450936383.0, "step": 11818 }, { "epoch": 1.503498282661239, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.47872543334961, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8674724698066711, "num_tokens": 450965525.0, "step": 11819 }, { "epoch": 1.5036254929398294, "ewc_loss": 0.13671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001163482666015625, "grad_norm": 40.56547927856445, "learning_rate": 1e-06, "loss": 0.6078, "mean_token_accuracy": 0.8478487133979797, "num_tokens": 451005874.0, "step": 11820 }, { "epoch": 1.50375270321842, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.33324432373047, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8422636389732361, "num_tokens": 451040668.0, "step": 11821 }, { "epoch": 1.5038799134970104, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 41.114776611328125, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8570908308029175, "num_tokens": 451082747.0, "step": 11822 }, { "epoch": 1.504007123775601, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.09577560424805, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8672047853469849, "num_tokens": 451120344.0, "step": 11823 }, { "epoch": 1.5041343340541915, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.0777702331543, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8523818254470825, "num_tokens": 451156564.0, "step": 11824 }, { "epoch": 1.504261544332782, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.6041259765625, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8561467528343201, "num_tokens": 451197252.0, "step": 11825 }, { "epoch": 1.5043887546113726, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.24420928955078, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8487411141395569, "num_tokens": 451236150.0, "step": 11826 }, { "epoch": 1.504515964889963, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.09950637817383, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8564630150794983, "num_tokens": 451275751.0, "step": 11827 }, { "epoch": 1.5046431751685536, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.9864501953125, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8797934055328369, "num_tokens": 451316778.0, "step": 11828 }, { "epoch": 1.5047703854471441, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 41.093345642089844, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8600213527679443, "num_tokens": 451359921.0, "step": 11829 }, { "epoch": 1.5048975957257347, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.493099212646484, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8655440807342529, "num_tokens": 451395616.0, "step": 11830 }, { "epoch": 1.5050248060043252, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.527801513671875, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8599627017974854, "num_tokens": 451427766.0, "step": 11831 }, { "epoch": 1.5051520162829157, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011682510375976562, "grad_norm": 40.49525451660156, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8681468963623047, "num_tokens": 451463073.0, "step": 11832 }, { "epoch": 1.5052792265615063, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.741310119628906, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8797999620437622, "num_tokens": 451503039.0, "step": 11833 }, { "epoch": 1.5054064368400968, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.6748046875, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.843781054019928, "num_tokens": 451541599.0, "step": 11834 }, { "epoch": 1.5055336471186873, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.148006439208984, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8462445139884949, "num_tokens": 451579278.0, "step": 11835 }, { "epoch": 1.5056608573972778, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.10985565185547, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8630144000053406, "num_tokens": 451618591.0, "step": 11836 }, { "epoch": 1.5057880676758684, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.909019470214844, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.860824465751648, "num_tokens": 451654453.0, "step": 11837 }, { "epoch": 1.5059152779544587, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.12339782714844, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.858791708946228, "num_tokens": 451688302.0, "step": 11838 }, { "epoch": 1.5060424882330492, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.94352340698242, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8657044172286987, "num_tokens": 451727301.0, "step": 11839 }, { "epoch": 1.5061696985116397, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.37360382080078, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8450164794921875, "num_tokens": 451770471.0, "step": 11840 }, { "epoch": 1.5062969087902303, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.794593811035156, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8475860357284546, "num_tokens": 451809413.0, "step": 11841 }, { "epoch": 1.5064241190688208, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.33540344238281, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8566997051239014, "num_tokens": 451850177.0, "step": 11842 }, { "epoch": 1.5065513293474113, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.65250015258789, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8472553491592407, "num_tokens": 451887977.0, "step": 11843 }, { "epoch": 1.5066785396260016, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.17984390258789, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8706583380699158, "num_tokens": 451927403.0, "step": 11844 }, { "epoch": 1.5068057499045922, "ewc_loss": 0.1376953125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.686405181884766, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8596110343933105, "num_tokens": 451967350.0, "step": 11845 }, { "epoch": 1.5069329601831827, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.26762771606445, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8599352836608887, "num_tokens": 452008664.0, "step": 11846 }, { "epoch": 1.5070601704617732, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.891395568847656, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8650516271591187, "num_tokens": 452047303.0, "step": 11847 }, { "epoch": 1.5071873807403637, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.20891189575195, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8608150482177734, "num_tokens": 452087359.0, "step": 11848 }, { "epoch": 1.5073145910189543, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.77154541015625, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8466418981552124, "num_tokens": 452129426.0, "step": 11849 }, { "epoch": 1.5074418012975448, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.29678726196289, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8646152019500732, "num_tokens": 452169134.0, "step": 11850 }, { "epoch": 1.5075690115761353, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.10139846801758, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8509305715560913, "num_tokens": 452209962.0, "step": 11851 }, { "epoch": 1.5076962218547258, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.94436264038086, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8634083271026611, "num_tokens": 452245978.0, "step": 11852 }, { "epoch": 1.5078234321333164, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.07079315185547, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8689244985580444, "num_tokens": 452281013.0, "step": 11853 }, { "epoch": 1.507950642411907, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.87122344970703, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8739619255065918, "num_tokens": 452317174.0, "step": 11854 }, { "epoch": 1.5080778526904974, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.06406784057617, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8570625185966492, "num_tokens": 452362974.0, "step": 11855 }, { "epoch": 1.508205062969088, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.173484802246094, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8713349103927612, "num_tokens": 452400701.0, "step": 11856 }, { "epoch": 1.5083322732476785, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.06766891479492, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8691951036453247, "num_tokens": 452438541.0, "step": 11857 }, { "epoch": 1.508459483526269, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.950592041015625, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8545598983764648, "num_tokens": 452476918.0, "step": 11858 }, { "epoch": 1.5085866938048595, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.50254440307617, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8648734092712402, "num_tokens": 452509231.0, "step": 11859 }, { "epoch": 1.50871390408345, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.91444396972656, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8547289371490479, "num_tokens": 452547079.0, "step": 11860 }, { "epoch": 1.5088411143620406, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.39822769165039, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8705759048461914, "num_tokens": 452580215.0, "step": 11861 }, { "epoch": 1.508968324640631, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.2722282409668, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8662968873977661, "num_tokens": 452615778.0, "step": 11862 }, { "epoch": 1.5090955349192214, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.81250762939453, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8621219396591187, "num_tokens": 452652184.0, "step": 11863 }, { "epoch": 1.509222745197812, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.06705856323242, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.845878005027771, "num_tokens": 452691345.0, "step": 11864 }, { "epoch": 1.5093499554764025, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.77503967285156, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8618941307067871, "num_tokens": 452729982.0, "step": 11865 }, { "epoch": 1.509477165754993, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.925174713134766, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8570128679275513, "num_tokens": 452771272.0, "step": 11866 }, { "epoch": 1.5096043760335836, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.8869514465332, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8564125299453735, "num_tokens": 452812231.0, "step": 11867 }, { "epoch": 1.509731586312174, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.82672882080078, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8489532470703125, "num_tokens": 452858406.0, "step": 11868 }, { "epoch": 1.5098587965907644, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.1543083190918, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8511847853660583, "num_tokens": 452898793.0, "step": 11869 }, { "epoch": 1.509986006869355, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.668479919433594, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.865446925163269, "num_tokens": 452937929.0, "step": 11870 }, { "epoch": 1.5101132171479454, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.92791748046875, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8638893961906433, "num_tokens": 452975436.0, "step": 11871 }, { "epoch": 1.510240427426536, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.0589599609375, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8595215082168579, "num_tokens": 453016334.0, "step": 11872 }, { "epoch": 1.5103676377051265, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.7778434753418, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8637939691543579, "num_tokens": 453053630.0, "step": 11873 }, { "epoch": 1.510494847983717, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.31367492675781, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8418022394180298, "num_tokens": 453092341.0, "step": 11874 }, { "epoch": 1.5106220582623076, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.84771728515625, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8679469227790833, "num_tokens": 453126859.0, "step": 11875 }, { "epoch": 1.510749268540898, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.1422119140625, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8564350605010986, "num_tokens": 453167327.0, "step": 11876 }, { "epoch": 1.5108764788194886, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.84882736206055, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8651787042617798, "num_tokens": 453205901.0, "step": 11877 }, { "epoch": 1.5110036890980791, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.681800842285156, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8579716086387634, "num_tokens": 453241856.0, "step": 11878 }, { "epoch": 1.5111308993766697, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.3074951171875, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.86782306432724, "num_tokens": 453282528.0, "step": 11879 }, { "epoch": 1.5112581096552602, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.518341064453125, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8497461080551147, "num_tokens": 453318762.0, "step": 11880 }, { "epoch": 1.5113853199338507, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.244232177734375, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.874010443687439, "num_tokens": 453352482.0, "step": 11881 }, { "epoch": 1.5115125302124413, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.63957214355469, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8600054979324341, "num_tokens": 453393692.0, "step": 11882 }, { "epoch": 1.5116397404910318, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.126590728759766, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8636050820350647, "num_tokens": 453429217.0, "step": 11883 }, { "epoch": 1.5117669507696223, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.12601852416992, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8446253538131714, "num_tokens": 453468190.0, "step": 11884 }, { "epoch": 1.5118941610482128, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.242977142333984, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8543031811714172, "num_tokens": 453509893.0, "step": 11885 }, { "epoch": 1.5120213713268034, "ewc_loss": 0.140625, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.96760940551758, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8633359670639038, "num_tokens": 453546972.0, "step": 11886 }, { "epoch": 1.5121485816053937, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.0612678527832, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8716864585876465, "num_tokens": 453578480.0, "step": 11887 }, { "epoch": 1.5122757918839842, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.215309143066406, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8607615232467651, "num_tokens": 453623248.0, "step": 11888 }, { "epoch": 1.5124030021625747, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.99173355102539, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8809539675712585, "num_tokens": 453660088.0, "step": 11889 }, { "epoch": 1.5125302124411653, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.87763595581055, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8632704019546509, "num_tokens": 453698931.0, "step": 11890 }, { "epoch": 1.5126574227197558, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.03849792480469, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8626406192779541, "num_tokens": 453734220.0, "step": 11891 }, { "epoch": 1.5127846329983463, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.038299560546875, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8593254089355469, "num_tokens": 453774056.0, "step": 11892 }, { "epoch": 1.5129118432769366, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.105464935302734, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8663657903671265, "num_tokens": 453806616.0, "step": 11893 }, { "epoch": 1.5130390535555271, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.308921813964844, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8692548274993896, "num_tokens": 453843838.0, "step": 11894 }, { "epoch": 1.5131662638341177, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.138980865478516, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8670507669448853, "num_tokens": 453876888.0, "step": 11895 }, { "epoch": 1.5132934741127082, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.93334197998047, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8749638199806213, "num_tokens": 453915334.0, "step": 11896 }, { "epoch": 1.5134206843912987, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.37217330932617, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8706222772598267, "num_tokens": 453950590.0, "step": 11897 }, { "epoch": 1.5135478946698893, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.93121337890625, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8639829158782959, "num_tokens": 453992590.0, "step": 11898 }, { "epoch": 1.5136751049484798, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.37592315673828, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8651927709579468, "num_tokens": 454038619.0, "step": 11899 }, { "epoch": 1.5138023152270703, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.79478073120117, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8496351838111877, "num_tokens": 454081564.0, "step": 11900 }, { "epoch": 1.5139295255056608, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.207183837890625, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8486059308052063, "num_tokens": 454125581.0, "step": 11901 }, { "epoch": 1.5140567357842514, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.120582580566406, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8693203926086426, "num_tokens": 454163961.0, "step": 11902 }, { "epoch": 1.514183946062842, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.33038330078125, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8525589108467102, "num_tokens": 454200410.0, "step": 11903 }, { "epoch": 1.5143111563414324, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.03767776489258, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8474606275558472, "num_tokens": 454237143.0, "step": 11904 }, { "epoch": 1.514438366620023, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.14325714111328, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8676981925964355, "num_tokens": 454274022.0, "step": 11905 }, { "epoch": 1.5145655768986135, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.43000411987305, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8544886708259583, "num_tokens": 454307955.0, "step": 11906 }, { "epoch": 1.514692787177204, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.329689025878906, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8691734075546265, "num_tokens": 454345449.0, "step": 11907 }, { "epoch": 1.5148199974557945, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.19317626953125, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8483418226242065, "num_tokens": 454383096.0, "step": 11908 }, { "epoch": 1.514947207734385, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.1763916015625, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8482967019081116, "num_tokens": 454420142.0, "step": 11909 }, { "epoch": 1.5150744180129756, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.3069953918457, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8601347208023071, "num_tokens": 454454991.0, "step": 11910 }, { "epoch": 1.515201628291566, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.841285705566406, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8574730753898621, "num_tokens": 454490161.0, "step": 11911 }, { "epoch": 1.5153288385701564, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.981319427490234, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8511296510696411, "num_tokens": 454534767.0, "step": 11912 }, { "epoch": 1.515456048848747, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.834686279296875, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8696005344390869, "num_tokens": 454573246.0, "step": 11913 }, { "epoch": 1.5155832591273375, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.118255615234375, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8737078309059143, "num_tokens": 454607626.0, "step": 11914 }, { "epoch": 1.515710469405928, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.66411590576172, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8533132672309875, "num_tokens": 454639840.0, "step": 11915 }, { "epoch": 1.5158376796845185, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.03795623779297, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8524260520935059, "num_tokens": 454681445.0, "step": 11916 }, { "epoch": 1.515964889963109, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.496761322021484, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8544440269470215, "num_tokens": 454726191.0, "step": 11917 }, { "epoch": 1.5160921002416994, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.22014617919922, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8671694993972778, "num_tokens": 454765112.0, "step": 11918 }, { "epoch": 1.51621931052029, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.6419792175293, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8579853177070618, "num_tokens": 454805254.0, "step": 11919 }, { "epoch": 1.5163465207988804, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.96684265136719, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8534219264984131, "num_tokens": 454841344.0, "step": 11920 }, { "epoch": 1.516473731077471, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.84552764892578, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8734754323959351, "num_tokens": 454879986.0, "step": 11921 }, { "epoch": 1.5166009413560615, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.03116989135742, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8519049882888794, "num_tokens": 454917731.0, "step": 11922 }, { "epoch": 1.516728151634652, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.607852935791016, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8656242489814758, "num_tokens": 454952837.0, "step": 11923 }, { "epoch": 1.5168553619132426, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.15001678466797, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8625138401985168, "num_tokens": 454992751.0, "step": 11924 }, { "epoch": 1.516982572191833, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.902645111083984, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8593149185180664, "num_tokens": 455029829.0, "step": 11925 }, { "epoch": 1.5171097824704236, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.85478591918945, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8594712615013123, "num_tokens": 455072167.0, "step": 11926 }, { "epoch": 1.5172369927490141, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.00236892700195, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8675367832183838, "num_tokens": 455108433.0, "step": 11927 }, { "epoch": 1.5173642030276047, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.91646194458008, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8637202978134155, "num_tokens": 455148943.0, "step": 11928 }, { "epoch": 1.5174914133061952, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.0478630065918, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.856080949306488, "num_tokens": 455185099.0, "step": 11929 }, { "epoch": 1.5176186235847857, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.7869873046875, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8751718997955322, "num_tokens": 455225322.0, "step": 11930 }, { "epoch": 1.5177458338633762, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.07211685180664, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8577166795730591, "num_tokens": 455262830.0, "step": 11931 }, { "epoch": 1.5178730441419668, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.77397537231445, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8679671883583069, "num_tokens": 455300825.0, "step": 11932 }, { "epoch": 1.5180002544205573, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.063045501708984, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8557609915733337, "num_tokens": 455336209.0, "step": 11933 }, { "epoch": 1.5181274646991478, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.86479949951172, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8581730723381042, "num_tokens": 455374909.0, "step": 11934 }, { "epoch": 1.5182546749777384, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.83491134643555, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8738943934440613, "num_tokens": 455417179.0, "step": 11935 }, { "epoch": 1.5183818852563287, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.926124572753906, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8468358516693115, "num_tokens": 455457923.0, "step": 11936 }, { "epoch": 1.5185090955349192, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.9803581237793, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8515582084655762, "num_tokens": 455492932.0, "step": 11937 }, { "epoch": 1.5186363058135097, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.98151397705078, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8506699800491333, "num_tokens": 455540392.0, "step": 11938 }, { "epoch": 1.5187635160921003, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.15157699584961, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8556420803070068, "num_tokens": 455574461.0, "step": 11939 }, { "epoch": 1.5188907263706908, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.091163635253906, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8579469919204712, "num_tokens": 455608315.0, "step": 11940 }, { "epoch": 1.5190179366492813, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.25139236450195, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8643666505813599, "num_tokens": 455654927.0, "step": 11941 }, { "epoch": 1.5191451469278716, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.16175842285156, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8489662408828735, "num_tokens": 455695745.0, "step": 11942 }, { "epoch": 1.5192723572064621, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.296783447265625, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8593634963035583, "num_tokens": 455734652.0, "step": 11943 }, { "epoch": 1.5193995674850527, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.859947204589844, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8374428153038025, "num_tokens": 455776063.0, "step": 11944 }, { "epoch": 1.5195267777636432, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.14666748046875, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8542879223823547, "num_tokens": 455820595.0, "step": 11945 }, { "epoch": 1.5196539880422337, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.22231674194336, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8659871220588684, "num_tokens": 455859553.0, "step": 11946 }, { "epoch": 1.5197811983208243, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.3515739440918, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8616801500320435, "num_tokens": 455892676.0, "step": 11947 }, { "epoch": 1.5199084085994148, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.5628547668457, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8520457148551941, "num_tokens": 455932106.0, "step": 11948 }, { "epoch": 1.5200356188780053, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.45448684692383, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8652267456054688, "num_tokens": 455971555.0, "step": 11949 }, { "epoch": 1.5201628291565958, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.49587631225586, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8546429872512817, "num_tokens": 456014177.0, "step": 11950 }, { "epoch": 1.5202900394351864, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.64251708984375, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.863776683807373, "num_tokens": 456048229.0, "step": 11951 }, { "epoch": 1.520417249713777, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.54669189453125, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8785661458969116, "num_tokens": 456085815.0, "step": 11952 }, { "epoch": 1.5205444599923674, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.7142448425293, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8620629906654358, "num_tokens": 456128971.0, "step": 11953 }, { "epoch": 1.520671670270958, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.487789154052734, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8424956202507019, "num_tokens": 456159705.0, "step": 11954 }, { "epoch": 1.5207988805495485, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.63276290893555, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8502675890922546, "num_tokens": 456198251.0, "step": 11955 }, { "epoch": 1.520926090828139, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.67076873779297, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.852762758731842, "num_tokens": 456237612.0, "step": 11956 }, { "epoch": 1.5210533011067295, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.45112228393555, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8588159680366516, "num_tokens": 456273665.0, "step": 11957 }, { "epoch": 1.52118051138532, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.88584518432617, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8616008758544922, "num_tokens": 456305467.0, "step": 11958 }, { "epoch": 1.5213077216639106, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.356895446777344, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8658573031425476, "num_tokens": 456342951.0, "step": 11959 }, { "epoch": 1.521434931942501, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.882843017578125, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.859353244304657, "num_tokens": 456375846.0, "step": 11960 }, { "epoch": 1.5215621422210914, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.33644485473633, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8680493235588074, "num_tokens": 456407993.0, "step": 11961 }, { "epoch": 1.521689352499682, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.08430099487305, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8409544229507446, "num_tokens": 456444278.0, "step": 11962 }, { "epoch": 1.5218165627782725, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.91055679321289, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8675417900085449, "num_tokens": 456483950.0, "step": 11963 }, { "epoch": 1.521943773056863, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.06898498535156, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8613318800926208, "num_tokens": 456527438.0, "step": 11964 }, { "epoch": 1.5220709833354535, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.08385467529297, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8712020516395569, "num_tokens": 456560572.0, "step": 11965 }, { "epoch": 1.5221981936140438, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.15840148925781, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8496378064155579, "num_tokens": 456604781.0, "step": 11966 }, { "epoch": 1.5223254038926344, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.28804397583008, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.873186469078064, "num_tokens": 456639415.0, "step": 11967 }, { "epoch": 1.522452614171225, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.120426177978516, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8735301494598389, "num_tokens": 456671183.0, "step": 11968 }, { "epoch": 1.5225798244498154, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.28190231323242, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8471958637237549, "num_tokens": 456712995.0, "step": 11969 }, { "epoch": 1.522707034728406, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.61517333984375, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8546552658081055, "num_tokens": 456750587.0, "step": 11970 }, { "epoch": 1.5228342450069965, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.90461349487305, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8672550916671753, "num_tokens": 456787503.0, "step": 11971 }, { "epoch": 1.522961455285587, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.6547966003418, "learning_rate": 1e-06, "loss": 0.6245, "mean_token_accuracy": 0.8375389575958252, "num_tokens": 456831645.0, "step": 11972 }, { "epoch": 1.5230886655641775, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.92578887939453, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8664358854293823, "num_tokens": 456869143.0, "step": 11973 }, { "epoch": 1.523215875842768, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.75304412841797, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8579977750778198, "num_tokens": 456906465.0, "step": 11974 }, { "epoch": 1.5233430861213586, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.3546257019043, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8598272800445557, "num_tokens": 456942048.0, "step": 11975 }, { "epoch": 1.5234702963999491, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.290977478027344, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8773279786109924, "num_tokens": 456975710.0, "step": 11976 }, { "epoch": 1.5235975066785397, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.01325607299805, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8479047417640686, "num_tokens": 457022295.0, "step": 11977 }, { "epoch": 1.5237247169571302, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.551700592041016, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8522202968597412, "num_tokens": 457060433.0, "step": 11978 }, { "epoch": 1.5238519272357207, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.94397735595703, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8672073483467102, "num_tokens": 457101760.0, "step": 11979 }, { "epoch": 1.5239791375143112, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.58811950683594, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8757033348083496, "num_tokens": 457142676.0, "step": 11980 }, { "epoch": 1.5241063477929018, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.324737548828125, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8668019771575928, "num_tokens": 457179507.0, "step": 11981 }, { "epoch": 1.5242335580714923, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.70480728149414, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8615952730178833, "num_tokens": 457221563.0, "step": 11982 }, { "epoch": 1.5243607683500828, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.85641860961914, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8582586050033569, "num_tokens": 457258634.0, "step": 11983 }, { "epoch": 1.5244879786286734, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 40.09279251098633, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.87397301197052, "num_tokens": 457293097.0, "step": 11984 }, { "epoch": 1.5246151889072637, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.91932678222656, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8749656081199646, "num_tokens": 457333525.0, "step": 11985 }, { "epoch": 1.5247423991858542, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011777877807617188, "grad_norm": 40.45124816894531, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8471084833145142, "num_tokens": 457379772.0, "step": 11986 }, { "epoch": 1.5248696094644447, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.7128791809082, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8659334182739258, "num_tokens": 457415852.0, "step": 11987 }, { "epoch": 1.5249968197430352, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.392723083496094, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8653753995895386, "num_tokens": 457449771.0, "step": 11988 }, { "epoch": 1.5251240300216258, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.845481872558594, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.855565071105957, "num_tokens": 457489601.0, "step": 11989 }, { "epoch": 1.5252512403002163, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.351951599121094, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.87464439868927, "num_tokens": 457530516.0, "step": 11990 }, { "epoch": 1.5253784505788066, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.89453887939453, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8572572469711304, "num_tokens": 457568299.0, "step": 11991 }, { "epoch": 1.5255056608573971, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.5971565246582, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8582768440246582, "num_tokens": 457600476.0, "step": 11992 }, { "epoch": 1.5256328711359877, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.97325897216797, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.88096684217453, "num_tokens": 457632024.0, "step": 11993 }, { "epoch": 1.5257600814145782, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.04008483886719, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8669518232345581, "num_tokens": 457670805.0, "step": 11994 }, { "epoch": 1.5258872916931687, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.18026351928711, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8623005747795105, "num_tokens": 457702468.0, "step": 11995 }, { "epoch": 1.5260145019717593, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.34074401855469, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8708825707435608, "num_tokens": 457740443.0, "step": 11996 }, { "epoch": 1.5261417122503498, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.0555419921875, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8631812334060669, "num_tokens": 457775833.0, "step": 11997 }, { "epoch": 1.5262689225289403, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.79706954956055, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.864862322807312, "num_tokens": 457815850.0, "step": 11998 }, { "epoch": 1.5263961328075308, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.99834442138672, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8675124645233154, "num_tokens": 457854946.0, "step": 11999 }, { "epoch": 1.5265233430861214, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.10312271118164, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8658127188682556, "num_tokens": 457890775.0, "step": 12000 }, { "epoch": 1.526650553364712, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.9081916809082, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8655341267585754, "num_tokens": 457928143.0, "step": 12001 }, { "epoch": 1.5267777636433024, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.02925109863281, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8679777383804321, "num_tokens": 457965999.0, "step": 12002 }, { "epoch": 1.526904973921893, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.00589370727539, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8592777252197266, "num_tokens": 458006638.0, "step": 12003 }, { "epoch": 1.5270321842004835, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.45753860473633, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8619064092636108, "num_tokens": 458047522.0, "step": 12004 }, { "epoch": 1.527159394479074, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.1774787902832, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8616151809692383, "num_tokens": 458091219.0, "step": 12005 }, { "epoch": 1.5272866047576645, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.117828369140625, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8483328819274902, "num_tokens": 458129208.0, "step": 12006 }, { "epoch": 1.527413815036255, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.60068130493164, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8567273616790771, "num_tokens": 458163632.0, "step": 12007 }, { "epoch": 1.5275410253148456, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.575740814208984, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8549450635910034, "num_tokens": 458202377.0, "step": 12008 }, { "epoch": 1.527668235593436, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.50272750854492, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8598482608795166, "num_tokens": 458239901.0, "step": 12009 }, { "epoch": 1.5277954458720264, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.501033782958984, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8536611199378967, "num_tokens": 458280291.0, "step": 12010 }, { "epoch": 1.527922656150617, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.160438537597656, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8669016361236572, "num_tokens": 458316931.0, "step": 12011 }, { "epoch": 1.5280498664292075, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.70140075683594, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8559727668762207, "num_tokens": 458356477.0, "step": 12012 }, { "epoch": 1.528177076707798, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.07503890991211, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8464696407318115, "num_tokens": 458401340.0, "step": 12013 }, { "epoch": 1.5283042869863885, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.53620910644531, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8667699098587036, "num_tokens": 458436190.0, "step": 12014 }, { "epoch": 1.5284314972649788, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.166133880615234, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8596880435943604, "num_tokens": 458465513.0, "step": 12015 }, { "epoch": 1.5285587075435694, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.9224967956543, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8581892848014832, "num_tokens": 458505465.0, "step": 12016 }, { "epoch": 1.52868591782216, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.505393981933594, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8604801893234253, "num_tokens": 458545087.0, "step": 12017 }, { "epoch": 1.5288131281007504, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.59562301635742, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8691987991333008, "num_tokens": 458583274.0, "step": 12018 }, { "epoch": 1.528940338379341, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.44650650024414, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8565781116485596, "num_tokens": 458626026.0, "step": 12019 }, { "epoch": 1.5290675486579315, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.45269775390625, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.873704195022583, "num_tokens": 458668774.0, "step": 12020 }, { "epoch": 1.529194758936522, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.5367431640625, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8647295236587524, "num_tokens": 458702256.0, "step": 12021 }, { "epoch": 1.5293219692151125, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.64552307128906, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8336788415908813, "num_tokens": 458741429.0, "step": 12022 }, { "epoch": 1.529449179493703, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.601829528808594, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8651810884475708, "num_tokens": 458775608.0, "step": 12023 }, { "epoch": 1.5295763897722936, "ewc_loss": 0.140625, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.65353012084961, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8655821681022644, "num_tokens": 458811451.0, "step": 12024 }, { "epoch": 1.5297036000508841, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.44342803955078, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8657430410385132, "num_tokens": 458851761.0, "step": 12025 }, { "epoch": 1.5298308103294747, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.7880859375, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.854254961013794, "num_tokens": 458888948.0, "step": 12026 }, { "epoch": 1.5299580206080652, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.47892379760742, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8591808080673218, "num_tokens": 458922466.0, "step": 12027 }, { "epoch": 1.5300852308866557, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.0164794921875, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8570675849914551, "num_tokens": 458956947.0, "step": 12028 }, { "epoch": 1.5302124411652462, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.5532341003418, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.879163384437561, "num_tokens": 458993634.0, "step": 12029 }, { "epoch": 1.5303396514438368, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.934242248535156, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8713907599449158, "num_tokens": 459034314.0, "step": 12030 }, { "epoch": 1.5304668617224273, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.27449035644531, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8670393824577332, "num_tokens": 459072117.0, "step": 12031 }, { "epoch": 1.5305940720010178, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.73149490356445, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8558591604232788, "num_tokens": 459112382.0, "step": 12032 }, { "epoch": 1.5307212822796084, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.00991439819336, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8744037747383118, "num_tokens": 459148973.0, "step": 12033 }, { "epoch": 1.5308484925581987, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.99213409423828, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.858834981918335, "num_tokens": 459183242.0, "step": 12034 }, { "epoch": 1.5309757028367892, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.832035064697266, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8795858025550842, "num_tokens": 459219606.0, "step": 12035 }, { "epoch": 1.5311029131153797, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.05618667602539, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8796875476837158, "num_tokens": 459253763.0, "step": 12036 }, { "epoch": 1.5312301233939702, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.14131164550781, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8636335730552673, "num_tokens": 459291187.0, "step": 12037 }, { "epoch": 1.5313573336725608, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.31325149536133, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8580285310745239, "num_tokens": 459327423.0, "step": 12038 }, { "epoch": 1.5314845439511513, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 40.95951843261719, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8569036722183228, "num_tokens": 459366207.0, "step": 12039 }, { "epoch": 1.5316117542297416, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.12934875488281, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8534483909606934, "num_tokens": 459408338.0, "step": 12040 }, { "epoch": 1.5317389645083321, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.964847564697266, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8725281953811646, "num_tokens": 459443420.0, "step": 12041 }, { "epoch": 1.5318661747869227, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.00925827026367, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.844654381275177, "num_tokens": 459475835.0, "step": 12042 }, { "epoch": 1.5319933850655132, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.397647857666016, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8591000437736511, "num_tokens": 459511993.0, "step": 12043 }, { "epoch": 1.5321205953441037, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.64515686035156, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.869830846786499, "num_tokens": 459546221.0, "step": 12044 }, { "epoch": 1.5322478056226942, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.491573333740234, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8659003973007202, "num_tokens": 459587271.0, "step": 12045 }, { "epoch": 1.5323750159012848, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.600711822509766, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8502165079116821, "num_tokens": 459623388.0, "step": 12046 }, { "epoch": 1.5325022261798753, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.57002639770508, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8620051145553589, "num_tokens": 459666440.0, "step": 12047 }, { "epoch": 1.5326294364584658, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.49097442626953, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8479804396629333, "num_tokens": 459708373.0, "step": 12048 }, { "epoch": 1.5327566467370564, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7640266418457, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8716391324996948, "num_tokens": 459746707.0, "step": 12049 }, { "epoch": 1.532883857015647, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.529422760009766, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8672565221786499, "num_tokens": 459792493.0, "step": 12050 }, { "epoch": 1.5330110672942374, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.71009063720703, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8696317672729492, "num_tokens": 459828897.0, "step": 12051 }, { "epoch": 1.533138277572828, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.84010314941406, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8529758453369141, "num_tokens": 459866274.0, "step": 12052 }, { "epoch": 1.5332654878514185, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.03096008300781, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8767298460006714, "num_tokens": 459902442.0, "step": 12053 }, { "epoch": 1.533392698130009, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.21044921875, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8696277141571045, "num_tokens": 459943331.0, "step": 12054 }, { "epoch": 1.5335199084085995, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.16828536987305, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8673821687698364, "num_tokens": 459975349.0, "step": 12055 }, { "epoch": 1.53364711868719, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.92558288574219, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8749092817306519, "num_tokens": 460008295.0, "step": 12056 }, { "epoch": 1.5337743289657806, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.218711853027344, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8657017946243286, "num_tokens": 460045892.0, "step": 12057 }, { "epoch": 1.533901539244371, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.00749206542969, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8627669811248779, "num_tokens": 460084255.0, "step": 12058 }, { "epoch": 1.5340287495229614, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.14918899536133, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8469412922859192, "num_tokens": 460123706.0, "step": 12059 }, { "epoch": 1.534155959801552, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.45185470581055, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8477694988250732, "num_tokens": 460162351.0, "step": 12060 }, { "epoch": 1.5342831700801425, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.736228942871094, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8524322509765625, "num_tokens": 460200135.0, "step": 12061 }, { "epoch": 1.534410380358733, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.2091064453125, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8558419942855835, "num_tokens": 460243585.0, "step": 12062 }, { "epoch": 1.5345375906373235, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.052005767822266, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8540281653404236, "num_tokens": 460283695.0, "step": 12063 }, { "epoch": 1.5346648009159138, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.29899597167969, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8700569868087769, "num_tokens": 460317241.0, "step": 12064 }, { "epoch": 1.5347920111945044, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.23078536987305, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8641483187675476, "num_tokens": 460351900.0, "step": 12065 }, { "epoch": 1.534919221473095, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.23073196411133, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8633514642715454, "num_tokens": 460391672.0, "step": 12066 }, { "epoch": 1.5350464317516854, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.286781311035156, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.852761447429657, "num_tokens": 460434120.0, "step": 12067 }, { "epoch": 1.535173642030276, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.04064178466797, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8639161586761475, "num_tokens": 460474214.0, "step": 12068 }, { "epoch": 1.5353008523088665, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.508399963378906, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8665506839752197, "num_tokens": 460507549.0, "step": 12069 }, { "epoch": 1.535428062587457, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.5897102355957, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8542207479476929, "num_tokens": 460546391.0, "step": 12070 }, { "epoch": 1.5355552728660475, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.6377067565918, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.851202666759491, "num_tokens": 460585232.0, "step": 12071 }, { "epoch": 1.535682483144638, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.54233932495117, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.869135856628418, "num_tokens": 460624398.0, "step": 12072 }, { "epoch": 1.5358096934232286, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.60087585449219, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8532620072364807, "num_tokens": 460667193.0, "step": 12073 }, { "epoch": 1.5359369037018191, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.61872100830078, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8379961252212524, "num_tokens": 460710614.0, "step": 12074 }, { "epoch": 1.5360641139804097, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.029048919677734, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8578248023986816, "num_tokens": 460751079.0, "step": 12075 }, { "epoch": 1.5361913242590002, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.44830322265625, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8616360425949097, "num_tokens": 460786768.0, "step": 12076 }, { "epoch": 1.5363185345375907, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.87335205078125, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8498279452323914, "num_tokens": 460824757.0, "step": 12077 }, { "epoch": 1.5364457448161812, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.101829528808594, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.854350745677948, "num_tokens": 460865024.0, "step": 12078 }, { "epoch": 1.5365729550947718, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.658382415771484, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.8521500825881958, "num_tokens": 460900895.0, "step": 12079 }, { "epoch": 1.5367001653733623, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.575042724609375, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8662869930267334, "num_tokens": 460936876.0, "step": 12080 }, { "epoch": 1.5368273756519528, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.30103302001953, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8444404602050781, "num_tokens": 460975297.0, "step": 12081 }, { "epoch": 1.5369545859305433, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.41086196899414, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8490419387817383, "num_tokens": 461008603.0, "step": 12082 }, { "epoch": 1.5370817962091337, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.67354202270508, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8519480228424072, "num_tokens": 461047770.0, "step": 12083 }, { "epoch": 1.5372090064877242, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.758453369140625, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8484505414962769, "num_tokens": 461088554.0, "step": 12084 }, { "epoch": 1.5373362167663147, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.65081024169922, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8385895490646362, "num_tokens": 461126070.0, "step": 12085 }, { "epoch": 1.5374634270449052, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.764644622802734, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8568885326385498, "num_tokens": 461166394.0, "step": 12086 }, { "epoch": 1.5375906373234958, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.58003234863281, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.85423743724823, "num_tokens": 461208263.0, "step": 12087 }, { "epoch": 1.5377178476020863, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.70751190185547, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8715387582778931, "num_tokens": 461245054.0, "step": 12088 }, { "epoch": 1.5378450578806766, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.6868782043457, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8501251935958862, "num_tokens": 461281584.0, "step": 12089 }, { "epoch": 1.5379722681592671, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.41701889038086, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8703399300575256, "num_tokens": 461322247.0, "step": 12090 }, { "epoch": 1.5380994784378577, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.11250305175781, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8478618264198303, "num_tokens": 461358371.0, "step": 12091 }, { "epoch": 1.5382266887164482, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.985931396484375, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8671911954879761, "num_tokens": 461394666.0, "step": 12092 }, { "epoch": 1.5383538989950387, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.827430725097656, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8726791143417358, "num_tokens": 461430875.0, "step": 12093 }, { "epoch": 1.5384811092736292, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.37118911743164, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8564857840538025, "num_tokens": 461467412.0, "step": 12094 }, { "epoch": 1.5386083195522198, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 40.83619689941406, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8390092849731445, "num_tokens": 461501970.0, "step": 12095 }, { "epoch": 1.5387355298308103, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.77385711669922, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8472951054573059, "num_tokens": 461542735.0, "step": 12096 }, { "epoch": 1.5388627401094008, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.27656173706055, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8620938658714294, "num_tokens": 461583883.0, "step": 12097 }, { "epoch": 1.5389899503879914, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.154052734375, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8520323038101196, "num_tokens": 461625124.0, "step": 12098 }, { "epoch": 1.5391171606665819, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.377071380615234, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8508644104003906, "num_tokens": 461660273.0, "step": 12099 }, { "epoch": 1.5392443709451724, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.274078369140625, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8547486066818237, "num_tokens": 461698890.0, "step": 12100 }, { "epoch": 1.539371581223763, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.30797576904297, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8615776300430298, "num_tokens": 461736927.0, "step": 12101 }, { "epoch": 1.5394987915023535, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.46339797973633, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8557959794998169, "num_tokens": 461777796.0, "step": 12102 }, { "epoch": 1.539626001780944, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.68708419799805, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8682436943054199, "num_tokens": 461814656.0, "step": 12103 }, { "epoch": 1.5397532120595345, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.568355560302734, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8418577909469604, "num_tokens": 461852186.0, "step": 12104 }, { "epoch": 1.539880422338125, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.142852783203125, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8735790252685547, "num_tokens": 461893724.0, "step": 12105 }, { "epoch": 1.5400076326167156, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 42.16850662231445, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8479104042053223, "num_tokens": 461935380.0, "step": 12106 }, { "epoch": 1.5401348428953059, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.84598159790039, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8664823770523071, "num_tokens": 461971463.0, "step": 12107 }, { "epoch": 1.5402620531738964, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.33291244506836, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8671611547470093, "num_tokens": 462007514.0, "step": 12108 }, { "epoch": 1.540389263452487, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011730194091796875, "grad_norm": 40.8475227355957, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8738508224487305, "num_tokens": 462046022.0, "step": 12109 }, { "epoch": 1.5405164737310775, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31794738769531, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8646213412284851, "num_tokens": 462089367.0, "step": 12110 }, { "epoch": 1.540643684009668, "ewc_loss": 0.138671875, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.000118255615234375, "grad_norm": 41.00237274169922, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8647093772888184, "num_tokens": 462128558.0, "step": 12111 }, { "epoch": 1.5407708942882585, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.636741638183594, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8673926591873169, "num_tokens": 462162639.0, "step": 12112 }, { "epoch": 1.5408981045668488, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.75358581542969, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8612948060035706, "num_tokens": 462199303.0, "step": 12113 }, { "epoch": 1.5410253148454394, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.470848083496094, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8574833869934082, "num_tokens": 462246095.0, "step": 12114 }, { "epoch": 1.54115252512403, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.494537353515625, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8551667928695679, "num_tokens": 462286653.0, "step": 12115 }, { "epoch": 1.5412797354026204, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.5897102355957, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.858556866645813, "num_tokens": 462328295.0, "step": 12116 }, { "epoch": 1.541406945681211, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.09006881713867, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8615062832832336, "num_tokens": 462365175.0, "step": 12117 }, { "epoch": 1.5415341559598015, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.36918258666992, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8528117537498474, "num_tokens": 462407044.0, "step": 12118 }, { "epoch": 1.541661366238392, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.51920700073242, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8529649972915649, "num_tokens": 462443819.0, "step": 12119 }, { "epoch": 1.5417885765169825, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.80419921875, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8698756694793701, "num_tokens": 462487319.0, "step": 12120 }, { "epoch": 1.541915786795573, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.012847900390625, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8617589473724365, "num_tokens": 462527026.0, "step": 12121 }, { "epoch": 1.5420429970741636, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.45023727416992, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8537324666976929, "num_tokens": 462557552.0, "step": 12122 }, { "epoch": 1.5421702073527541, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.488731384277344, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8622151017189026, "num_tokens": 462595243.0, "step": 12123 }, { "epoch": 1.5422974176313446, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.76019287109375, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8625171184539795, "num_tokens": 462636077.0, "step": 12124 }, { "epoch": 1.5424246279099352, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.97301483154297, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8718608617782593, "num_tokens": 462668407.0, "step": 12125 }, { "epoch": 1.5425518381885257, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.69474411010742, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8603838086128235, "num_tokens": 462707678.0, "step": 12126 }, { "epoch": 1.5426790484671162, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.886051177978516, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8531349897384644, "num_tokens": 462739722.0, "step": 12127 }, { "epoch": 1.5428062587457068, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.80491638183594, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.867026150226593, "num_tokens": 462770369.0, "step": 12128 }, { "epoch": 1.5429334690242973, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.78173828125, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8664724826812744, "num_tokens": 462806126.0, "step": 12129 }, { "epoch": 1.5430606793028878, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.9660758972168, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8628308773040771, "num_tokens": 462846585.0, "step": 12130 }, { "epoch": 1.5431878895814783, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.416603088378906, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8573617935180664, "num_tokens": 462889892.0, "step": 12131 }, { "epoch": 1.5433150998600687, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.06208419799805, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.860236644744873, "num_tokens": 462927612.0, "step": 12132 }, { "epoch": 1.5434423101386592, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.21294403076172, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8651183247566223, "num_tokens": 462971661.0, "step": 12133 }, { "epoch": 1.5435695204172497, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.244606018066406, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8652487993240356, "num_tokens": 463011260.0, "step": 12134 }, { "epoch": 1.5436967306958402, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.41029357910156, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8665754199028015, "num_tokens": 463045395.0, "step": 12135 }, { "epoch": 1.5438239409744308, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.16234588623047, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8504011631011963, "num_tokens": 463081015.0, "step": 12136 }, { "epoch": 1.5439511512530213, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.567508697509766, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8586598038673401, "num_tokens": 463115658.0, "step": 12137 }, { "epoch": 1.5440783615316116, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.11772537231445, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8715678453445435, "num_tokens": 463152084.0, "step": 12138 }, { "epoch": 1.5442055718102021, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.42626953125, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8522307872772217, "num_tokens": 463190137.0, "step": 12139 }, { "epoch": 1.5443327820887927, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.14686965942383, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8609966039657593, "num_tokens": 463227626.0, "step": 12140 }, { "epoch": 1.5444599923673832, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.94968032836914, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.860828697681427, "num_tokens": 463266451.0, "step": 12141 }, { "epoch": 1.5445872026459737, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.419254302978516, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8475311398506165, "num_tokens": 463305327.0, "step": 12142 }, { "epoch": 1.5447144129245642, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.15846633911133, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8576112389564514, "num_tokens": 463345890.0, "step": 12143 }, { "epoch": 1.5448416232031548, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.061187744140625, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8649406433105469, "num_tokens": 463381583.0, "step": 12144 }, { "epoch": 1.5449688334817453, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.567325592041016, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8694248795509338, "num_tokens": 463414442.0, "step": 12145 }, { "epoch": 1.5450960437603358, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.42991256713867, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8659431338310242, "num_tokens": 463449517.0, "step": 12146 }, { "epoch": 1.5452232540389264, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 40.755348205566406, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8671967387199402, "num_tokens": 463487952.0, "step": 12147 }, { "epoch": 1.5453504643175169, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.96794509887695, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8624087572097778, "num_tokens": 463520724.0, "step": 12148 }, { "epoch": 1.5454776745961074, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.35606002807617, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8569968342781067, "num_tokens": 463564691.0, "step": 12149 }, { "epoch": 1.545604884874698, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.185733795166016, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8562069535255432, "num_tokens": 463599037.0, "step": 12150 }, { "epoch": 1.5457320951532885, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 40.5839729309082, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8464367389678955, "num_tokens": 463630744.0, "step": 12151 }, { "epoch": 1.545859305431879, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.66530990600586, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8575327396392822, "num_tokens": 463663551.0, "step": 12152 }, { "epoch": 1.5459865157104695, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.201438903808594, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8731306791305542, "num_tokens": 463702446.0, "step": 12153 }, { "epoch": 1.54611372598906, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.360897064208984, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8519852757453918, "num_tokens": 463739324.0, "step": 12154 }, { "epoch": 1.5462409362676506, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.750606536865234, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8415155410766602, "num_tokens": 463780765.0, "step": 12155 }, { "epoch": 1.5463681465462409, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.20878601074219, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8389413356781006, "num_tokens": 463817727.0, "step": 12156 }, { "epoch": 1.5464953568248314, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7109489440918, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8759850263595581, "num_tokens": 463865793.0, "step": 12157 }, { "epoch": 1.546622567103422, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.308677673339844, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8611304759979248, "num_tokens": 463908299.0, "step": 12158 }, { "epoch": 1.5467497773820125, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.557899475097656, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8568572998046875, "num_tokens": 463944513.0, "step": 12159 }, { "epoch": 1.546876987660603, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 40.915626525878906, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8541950583457947, "num_tokens": 463982862.0, "step": 12160 }, { "epoch": 1.5470041979391935, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.88827133178711, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8659731149673462, "num_tokens": 464023357.0, "step": 12161 }, { "epoch": 1.5471314082177838, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 40.892086029052734, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8597521781921387, "num_tokens": 464071889.0, "step": 12162 }, { "epoch": 1.5472586184963744, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.757774353027344, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8468078970909119, "num_tokens": 464112334.0, "step": 12163 }, { "epoch": 1.5473858287749649, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.24484634399414, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8545804023742676, "num_tokens": 464156833.0, "step": 12164 }, { "epoch": 1.5475130390535554, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.450843811035156, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8622015118598938, "num_tokens": 464192989.0, "step": 12165 }, { "epoch": 1.547640249332146, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 40.912757873535156, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8654093742370605, "num_tokens": 464232855.0, "step": 12166 }, { "epoch": 1.5477674596107365, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.70014572143555, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8437883257865906, "num_tokens": 464271860.0, "step": 12167 }, { "epoch": 1.547894669889327, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.49281311035156, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8723130226135254, "num_tokens": 464312925.0, "step": 12168 }, { "epoch": 1.5480218801679175, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.936439514160156, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8553012013435364, "num_tokens": 464349190.0, "step": 12169 }, { "epoch": 1.548149090446508, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.07741165161133, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8552132248878479, "num_tokens": 464392031.0, "step": 12170 }, { "epoch": 1.5482763007250986, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07421112060547, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8549602031707764, "num_tokens": 464424884.0, "step": 12171 }, { "epoch": 1.5484035110036891, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.351478576660156, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8549530506134033, "num_tokens": 464466047.0, "step": 12172 }, { "epoch": 1.5485307212822796, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.864994049072266, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8597621917724609, "num_tokens": 464497293.0, "step": 12173 }, { "epoch": 1.5486579315608702, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.784420013427734, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8459069728851318, "num_tokens": 464541873.0, "step": 12174 }, { "epoch": 1.5487851418394607, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.61853790283203, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.858822226524353, "num_tokens": 464579712.0, "step": 12175 }, { "epoch": 1.5489123521180512, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.947792053222656, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8586386442184448, "num_tokens": 464612277.0, "step": 12176 }, { "epoch": 1.5490395623966418, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 42.082847595214844, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8575471043586731, "num_tokens": 464647062.0, "step": 12177 }, { "epoch": 1.5491667726752323, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.48017501831055, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8629651665687561, "num_tokens": 464677998.0, "step": 12178 }, { "epoch": 1.5492939829538228, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 42.24119567871094, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.851933479309082, "num_tokens": 464719205.0, "step": 12179 }, { "epoch": 1.5494211932324133, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.23046875, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8644075989723206, "num_tokens": 464760758.0, "step": 12180 }, { "epoch": 1.5495484035110036, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.12874221801758, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8632584810256958, "num_tokens": 464797422.0, "step": 12181 }, { "epoch": 1.5496756137895942, "ewc_loss": 0.140625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.780216217041016, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8603367209434509, "num_tokens": 464832889.0, "step": 12182 }, { "epoch": 1.5498028240681847, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 42.03710174560547, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8547158241271973, "num_tokens": 464867743.0, "step": 12183 }, { "epoch": 1.5499300343467752, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.54465103149414, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8472754955291748, "num_tokens": 464904666.0, "step": 12184 }, { "epoch": 1.5500572446253658, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 42.05092239379883, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8635942339897156, "num_tokens": 464942886.0, "step": 12185 }, { "epoch": 1.5501844549039563, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.19966125488281, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.86030113697052, "num_tokens": 464977463.0, "step": 12186 }, { "epoch": 1.5503116651825466, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.07488250732422, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8691177368164062, "num_tokens": 465015732.0, "step": 12187 }, { "epoch": 1.5504388754611371, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.77606964111328, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8691394329071045, "num_tokens": 465060789.0, "step": 12188 }, { "epoch": 1.5505660857397277, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.82822799682617, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.856701672077179, "num_tokens": 465097907.0, "step": 12189 }, { "epoch": 1.5506932960183182, "ewc_loss": 0.140625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.555259704589844, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8675410747528076, "num_tokens": 465139401.0, "step": 12190 }, { "epoch": 1.5508205062969087, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.63678741455078, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.837942898273468, "num_tokens": 465176531.0, "step": 12191 }, { "epoch": 1.5509477165754992, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.48018264770508, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8704630732536316, "num_tokens": 465216074.0, "step": 12192 }, { "epoch": 1.5510749268540898, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.96699142456055, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8746199607849121, "num_tokens": 465257986.0, "step": 12193 }, { "epoch": 1.5512021371326803, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.377445220947266, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.862899661064148, "num_tokens": 465296038.0, "step": 12194 }, { "epoch": 1.5513293474112708, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.60639572143555, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8353006839752197, "num_tokens": 465334215.0, "step": 12195 }, { "epoch": 1.5514565576898613, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.98654556274414, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.859167754650116, "num_tokens": 465373362.0, "step": 12196 }, { "epoch": 1.5515837679684519, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.37648010253906, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8546078205108643, "num_tokens": 465410181.0, "step": 12197 }, { "epoch": 1.5517109782470424, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.014217376708984, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8667031526565552, "num_tokens": 465443056.0, "step": 12198 }, { "epoch": 1.551838188525633, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.504844665527344, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8631371259689331, "num_tokens": 465484509.0, "step": 12199 }, { "epoch": 1.5519653988042235, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.59115982055664, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8581176400184631, "num_tokens": 465520017.0, "step": 12200 }, { "epoch": 1.552092609082814, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.55946350097656, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8752996921539307, "num_tokens": 465557221.0, "step": 12201 }, { "epoch": 1.5522198193614045, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.47831726074219, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8692171573638916, "num_tokens": 465595455.0, "step": 12202 }, { "epoch": 1.552347029639995, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.90261459350586, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8605899810791016, "num_tokens": 465636436.0, "step": 12203 }, { "epoch": 1.5524742399185856, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.55288314819336, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8572108745574951, "num_tokens": 465668956.0, "step": 12204 }, { "epoch": 1.5526014501971759, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.65260314941406, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.840235710144043, "num_tokens": 465708812.0, "step": 12205 }, { "epoch": 1.5527286604757664, "ewc_loss": 0.140625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00011968612670898438, "grad_norm": 41.69733810424805, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8494446277618408, "num_tokens": 465742610.0, "step": 12206 }, { "epoch": 1.552855870754357, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.1419677734375, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.858931303024292, "num_tokens": 465776468.0, "step": 12207 }, { "epoch": 1.5529830810329475, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.95901107788086, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.876376748085022, "num_tokens": 465809300.0, "step": 12208 }, { "epoch": 1.553110291311538, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.54417419433594, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8618037104606628, "num_tokens": 465849847.0, "step": 12209 }, { "epoch": 1.5532375015901285, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.7874870300293, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8707417249679565, "num_tokens": 465884986.0, "step": 12210 }, { "epoch": 1.5533647118687188, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.101051330566406, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8688963651657104, "num_tokens": 465919710.0, "step": 12211 }, { "epoch": 1.5534919221473094, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.06768798828125, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8521055579185486, "num_tokens": 465956041.0, "step": 12212 }, { "epoch": 1.5536191324258999, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.89189529418945, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8718308210372925, "num_tokens": 465994352.0, "step": 12213 }, { "epoch": 1.5537463427044904, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.444862365722656, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8619829416275024, "num_tokens": 466032493.0, "step": 12214 }, { "epoch": 1.553873552983081, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.40621566772461, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8619674444198608, "num_tokens": 466067048.0, "step": 12215 }, { "epoch": 1.5540007632616715, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.40147399902344, "learning_rate": 1e-06, "loss": 0.6178, "mean_token_accuracy": 0.836768388748169, "num_tokens": 466106895.0, "step": 12216 }, { "epoch": 1.554127973540262, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.85454559326172, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8459311723709106, "num_tokens": 466146740.0, "step": 12217 }, { "epoch": 1.5542551838188525, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.54667663574219, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8668426275253296, "num_tokens": 466187097.0, "step": 12218 }, { "epoch": 1.554382394097443, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.932979583740234, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8490579128265381, "num_tokens": 466223969.0, "step": 12219 }, { "epoch": 1.5545096043760336, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.37698745727539, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8705634474754333, "num_tokens": 466260127.0, "step": 12220 }, { "epoch": 1.554636814654624, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.80610275268555, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8654919862747192, "num_tokens": 466297219.0, "step": 12221 }, { "epoch": 1.5547640249332146, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.91569137573242, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.8400523066520691, "num_tokens": 466329474.0, "step": 12222 }, { "epoch": 1.5548912352118052, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.530704498291016, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8595795631408691, "num_tokens": 466372801.0, "step": 12223 }, { "epoch": 1.5550184454903957, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.631465911865234, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.868896484375, "num_tokens": 466412950.0, "step": 12224 }, { "epoch": 1.5551456557689862, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.60896682739258, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8660337328910828, "num_tokens": 466448830.0, "step": 12225 }, { "epoch": 1.5552728660475768, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.5024528503418, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8500149250030518, "num_tokens": 466489749.0, "step": 12226 }, { "epoch": 1.5554000763261673, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.53506851196289, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.842889666557312, "num_tokens": 466531320.0, "step": 12227 }, { "epoch": 1.5555272866047578, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.654396057128906, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.852657675743103, "num_tokens": 466570001.0, "step": 12228 }, { "epoch": 1.5556544968833483, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.41313934326172, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8633688688278198, "num_tokens": 466607186.0, "step": 12229 }, { "epoch": 1.5557817071619386, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.45693588256836, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8514515161514282, "num_tokens": 466649480.0, "step": 12230 }, { "epoch": 1.5559089174405292, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.1507682800293, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8664867877960205, "num_tokens": 466688861.0, "step": 12231 }, { "epoch": 1.5560361277191197, "ewc_loss": 0.140625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00011920928955078125, "grad_norm": 41.2302360534668, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.8402938842773438, "num_tokens": 466723200.0, "step": 12232 }, { "epoch": 1.5561633379977102, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.1529541015625, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8447599411010742, "num_tokens": 466759639.0, "step": 12233 }, { "epoch": 1.5562905482763008, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.74714279174805, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8544645309448242, "num_tokens": 466795748.0, "step": 12234 }, { "epoch": 1.5564177585548913, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.968505859375, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8710227012634277, "num_tokens": 466832938.0, "step": 12235 }, { "epoch": 1.5565449688334816, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.99049758911133, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8528484106063843, "num_tokens": 466868710.0, "step": 12236 }, { "epoch": 1.5566721791120721, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 42.02665710449219, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8569307923316956, "num_tokens": 466903558.0, "step": 12237 }, { "epoch": 1.5567993893906626, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.65142059326172, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8497901558876038, "num_tokens": 466946785.0, "step": 12238 }, { "epoch": 1.5569265996692532, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.63262939453125, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8538316488265991, "num_tokens": 466988563.0, "step": 12239 }, { "epoch": 1.5570538099478437, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.66343307495117, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8533387184143066, "num_tokens": 467025447.0, "step": 12240 }, { "epoch": 1.5571810202264342, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.663475036621094, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8681153059005737, "num_tokens": 467060751.0, "step": 12241 }, { "epoch": 1.5573082305050248, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.47239303588867, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8586010336875916, "num_tokens": 467101889.0, "step": 12242 }, { "epoch": 1.5574354407836153, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.53501510620117, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8622944355010986, "num_tokens": 467136782.0, "step": 12243 }, { "epoch": 1.5575626510622058, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.5613899230957, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8680520057678223, "num_tokens": 467174137.0, "step": 12244 }, { "epoch": 1.5576898613407963, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.835445404052734, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8545597791671753, "num_tokens": 467213919.0, "step": 12245 }, { "epoch": 1.5578170716193869, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.63933181762695, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8664237260818481, "num_tokens": 467254860.0, "step": 12246 }, { "epoch": 1.5579442818979774, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.995079040527344, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8703619241714478, "num_tokens": 467296806.0, "step": 12247 }, { "epoch": 1.558071492176568, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.524593353271484, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8658517599105835, "num_tokens": 467332882.0, "step": 12248 }, { "epoch": 1.5581987024551585, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.270362854003906, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8750914335250854, "num_tokens": 467375088.0, "step": 12249 }, { "epoch": 1.558325912733749, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.40776062011719, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8578567504882812, "num_tokens": 467413000.0, "step": 12250 }, { "epoch": 1.5584531230123395, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.13030242919922, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8748269081115723, "num_tokens": 467450947.0, "step": 12251 }, { "epoch": 1.55858033329093, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.223514556884766, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8507481217384338, "num_tokens": 467493770.0, "step": 12252 }, { "epoch": 1.5587075435695206, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.0590705871582, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.857814371585846, "num_tokens": 467531882.0, "step": 12253 }, { "epoch": 1.5588347538481109, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.19062805175781, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8856195211410522, "num_tokens": 467566059.0, "step": 12254 }, { "epoch": 1.5589619641267014, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.84214782714844, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8749136924743652, "num_tokens": 467603734.0, "step": 12255 }, { "epoch": 1.559089174405292, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.21668243408203, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8681193590164185, "num_tokens": 467641224.0, "step": 12256 }, { "epoch": 1.5592163846838825, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.75128936767578, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.857732355594635, "num_tokens": 467677576.0, "step": 12257 }, { "epoch": 1.559343594962473, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.648887634277344, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8677903413772583, "num_tokens": 467717526.0, "step": 12258 }, { "epoch": 1.5594708052410635, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.26163101196289, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8546056151390076, "num_tokens": 467757591.0, "step": 12259 }, { "epoch": 1.5595980155196538, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.52833557128906, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8471081256866455, "num_tokens": 467792794.0, "step": 12260 }, { "epoch": 1.5597252257982444, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.65187454223633, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8554195165634155, "num_tokens": 467829554.0, "step": 12261 }, { "epoch": 1.5598524360768349, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.091651916503906, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.844804048538208, "num_tokens": 467871403.0, "step": 12262 }, { "epoch": 1.5599796463554254, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.74928665161133, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8403803110122681, "num_tokens": 467906498.0, "step": 12263 }, { "epoch": 1.560106856634016, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.944862365722656, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8578227162361145, "num_tokens": 467946426.0, "step": 12264 }, { "epoch": 1.5602340669126065, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.034481048583984, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8626711368560791, "num_tokens": 467979930.0, "step": 12265 }, { "epoch": 1.560361277191197, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.79142761230469, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8540045022964478, "num_tokens": 468016347.0, "step": 12266 }, { "epoch": 1.5604884874697875, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.13230514526367, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8665094971656799, "num_tokens": 468051446.0, "step": 12267 }, { "epoch": 1.560615697748378, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 40.77214431762695, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8758894205093384, "num_tokens": 468085713.0, "step": 12268 }, { "epoch": 1.5607429080269686, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.1169548034668, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8712065815925598, "num_tokens": 468117863.0, "step": 12269 }, { "epoch": 1.560870118305559, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.285953521728516, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8561314344406128, "num_tokens": 468153041.0, "step": 12270 }, { "epoch": 1.5609973285841496, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.60713577270508, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8573558330535889, "num_tokens": 468193624.0, "step": 12271 }, { "epoch": 1.5611245388627402, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.55290603637695, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8748198747634888, "num_tokens": 468227091.0, "step": 12272 }, { "epoch": 1.5612517491413307, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.10995864868164, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8551002144813538, "num_tokens": 468265079.0, "step": 12273 }, { "epoch": 1.5613789594199212, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.820838928222656, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8584510684013367, "num_tokens": 468307055.0, "step": 12274 }, { "epoch": 1.5615061696985117, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.224403381347656, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8545657396316528, "num_tokens": 468344429.0, "step": 12275 }, { "epoch": 1.5616333799771023, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.764644622802734, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.858601450920105, "num_tokens": 468379448.0, "step": 12276 }, { "epoch": 1.5617605902556928, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.299617767333984, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8640201687812805, "num_tokens": 468423344.0, "step": 12277 }, { "epoch": 1.5618878005342833, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.72370529174805, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8666834831237793, "num_tokens": 468461657.0, "step": 12278 }, { "epoch": 1.5620150108128736, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.515296936035156, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8747089505195618, "num_tokens": 468500428.0, "step": 12279 }, { "epoch": 1.5621422210914642, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.671470642089844, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8482880592346191, "num_tokens": 468536310.0, "step": 12280 }, { "epoch": 1.5622694313700547, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.956485748291016, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8582816123962402, "num_tokens": 468577954.0, "step": 12281 }, { "epoch": 1.5623966416486452, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.67081069946289, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8534080982208252, "num_tokens": 468612131.0, "step": 12282 }, { "epoch": 1.5625238519272358, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.79786682128906, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8552118539810181, "num_tokens": 468654022.0, "step": 12283 }, { "epoch": 1.5626510622058263, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.511138916015625, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8504123687744141, "num_tokens": 468693120.0, "step": 12284 }, { "epoch": 1.5627782724844166, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.42366027832031, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8584149479866028, "num_tokens": 468727695.0, "step": 12285 }, { "epoch": 1.5629054827630071, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.62135314941406, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8560309410095215, "num_tokens": 468766705.0, "step": 12286 }, { "epoch": 1.5630326930415976, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.162532806396484, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8491350412368774, "num_tokens": 468804555.0, "step": 12287 }, { "epoch": 1.5631599033201882, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.84482955932617, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.855984091758728, "num_tokens": 468842495.0, "step": 12288 }, { "epoch": 1.5632871135987787, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7647819519043, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8569381237030029, "num_tokens": 468877574.0, "step": 12289 }, { "epoch": 1.5634143238773692, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.666595458984375, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8706728219985962, "num_tokens": 468914573.0, "step": 12290 }, { "epoch": 1.5635415341559598, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.59489440917969, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8690885901451111, "num_tokens": 468950943.0, "step": 12291 }, { "epoch": 1.5636687444345503, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7017822265625, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8754834532737732, "num_tokens": 468990007.0, "step": 12292 }, { "epoch": 1.5637959547131408, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.300968170166016, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8617616891860962, "num_tokens": 469022461.0, "step": 12293 }, { "epoch": 1.5639231649917313, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.12993621826172, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8656906485557556, "num_tokens": 469056396.0, "step": 12294 }, { "epoch": 1.5640503752703219, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.82909393310547, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8691402673721313, "num_tokens": 469099464.0, "step": 12295 }, { "epoch": 1.5641775855489124, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.1435661315918, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8664090633392334, "num_tokens": 469139830.0, "step": 12296 }, { "epoch": 1.564304795827503, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.4249267578125, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8590866327285767, "num_tokens": 469177838.0, "step": 12297 }, { "epoch": 1.5644320061060935, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.59385299682617, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8848733901977539, "num_tokens": 469218727.0, "step": 12298 }, { "epoch": 1.564559216384684, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.29685592651367, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8381310701370239, "num_tokens": 469261186.0, "step": 12299 }, { "epoch": 1.5646864266632745, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.37077713012695, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8666068315505981, "num_tokens": 469301191.0, "step": 12300 }, { "epoch": 1.564813636941865, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.76667785644531, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8655561208724976, "num_tokens": 469340132.0, "step": 12301 }, { "epoch": 1.5649408472204556, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.66015625, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8791607618331909, "num_tokens": 469376211.0, "step": 12302 }, { "epoch": 1.5650680574990459, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.426265716552734, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8495126366615295, "num_tokens": 469413961.0, "step": 12303 }, { "epoch": 1.5651952677776364, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.72480392456055, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.858850359916687, "num_tokens": 469449507.0, "step": 12304 }, { "epoch": 1.565322478056227, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.288421630859375, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8459212779998779, "num_tokens": 469486324.0, "step": 12305 }, { "epoch": 1.5654496883348175, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.8379020690918, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8550833463668823, "num_tokens": 469523799.0, "step": 12306 }, { "epoch": 1.565576898613408, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.03710174560547, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8451095819473267, "num_tokens": 469558661.0, "step": 12307 }, { "epoch": 1.5657041088919985, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.04209899902344, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.857887864112854, "num_tokens": 469603087.0, "step": 12308 }, { "epoch": 1.5658313191705888, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.16178512573242, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8595371842384338, "num_tokens": 469637344.0, "step": 12309 }, { "epoch": 1.5659585294491793, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.71234130859375, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.868473470211029, "num_tokens": 469681127.0, "step": 12310 }, { "epoch": 1.5660857397277699, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.64894485473633, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8578521013259888, "num_tokens": 469724238.0, "step": 12311 }, { "epoch": 1.5662129500063604, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.642147064208984, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8528032302856445, "num_tokens": 469765967.0, "step": 12312 }, { "epoch": 1.566340160284951, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.84831237792969, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8623640537261963, "num_tokens": 469801387.0, "step": 12313 }, { "epoch": 1.5664673705635415, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.481529235839844, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8599679470062256, "num_tokens": 469835448.0, "step": 12314 }, { "epoch": 1.566594580842132, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.75802230834961, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8504701256752014, "num_tokens": 469876647.0, "step": 12315 }, { "epoch": 1.5667217911207225, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.6396484375, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8737753629684448, "num_tokens": 469918158.0, "step": 12316 }, { "epoch": 1.566849001399313, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7676887512207, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8439013957977295, "num_tokens": 469953691.0, "step": 12317 }, { "epoch": 1.5669762116779036, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.50663375854492, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8636183738708496, "num_tokens": 469996174.0, "step": 12318 }, { "epoch": 1.567103421956494, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.659385681152344, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8454633355140686, "num_tokens": 470031371.0, "step": 12319 }, { "epoch": 1.5672306322350846, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.9383659362793, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8643736839294434, "num_tokens": 470068783.0, "step": 12320 }, { "epoch": 1.5673578425136752, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.51966857910156, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.844088077545166, "num_tokens": 470103490.0, "step": 12321 }, { "epoch": 1.5674850527922657, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.00593566894531, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8555743098258972, "num_tokens": 470137213.0, "step": 12322 }, { "epoch": 1.5676122630708562, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.819210052490234, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8645082712173462, "num_tokens": 470171816.0, "step": 12323 }, { "epoch": 1.5677394733494467, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.763511657714844, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.8361254334449768, "num_tokens": 470210517.0, "step": 12324 }, { "epoch": 1.5678666836280373, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.71379852294922, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8481389284133911, "num_tokens": 470247085.0, "step": 12325 }, { "epoch": 1.5679938939066278, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.74900436401367, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8603558540344238, "num_tokens": 470288483.0, "step": 12326 }, { "epoch": 1.5681211041852183, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.03848648071289, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8611641526222229, "num_tokens": 470322173.0, "step": 12327 }, { "epoch": 1.5682483144638086, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.206626892089844, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8603034615516663, "num_tokens": 470356617.0, "step": 12328 }, { "epoch": 1.5683755247423992, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.081886291503906, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8705724477767944, "num_tokens": 470393095.0, "step": 12329 }, { "epoch": 1.5685027350209897, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.288856506347656, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8712844848632812, "num_tokens": 470427785.0, "step": 12330 }, { "epoch": 1.5686299452995802, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.5898551940918, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8546786904335022, "num_tokens": 470469971.0, "step": 12331 }, { "epoch": 1.5687571555781707, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.878440856933594, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8517584204673767, "num_tokens": 470503935.0, "step": 12332 }, { "epoch": 1.5688843658567613, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.817562103271484, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8630228638648987, "num_tokens": 470536014.0, "step": 12333 }, { "epoch": 1.5690115761353516, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.10860061645508, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8729631900787354, "num_tokens": 470575632.0, "step": 12334 }, { "epoch": 1.569138786413942, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.62481689453125, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8691166043281555, "num_tokens": 470611295.0, "step": 12335 }, { "epoch": 1.5692659966925326, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.94337844848633, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8590663075447083, "num_tokens": 470646993.0, "step": 12336 }, { "epoch": 1.5693932069711232, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.23412322998047, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8644440770149231, "num_tokens": 470680253.0, "step": 12337 }, { "epoch": 1.5695204172497137, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.16277313232422, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8595900535583496, "num_tokens": 470716169.0, "step": 12338 }, { "epoch": 1.5696476275283042, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.073204040527344, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.872504472732544, "num_tokens": 470754208.0, "step": 12339 }, { "epoch": 1.5697748378068948, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.38311004638672, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.861351728439331, "num_tokens": 470801329.0, "step": 12340 }, { "epoch": 1.5699020480854853, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.065982818603516, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8548292517662048, "num_tokens": 470838408.0, "step": 12341 }, { "epoch": 1.5700292583640758, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.260101318359375, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8597385287284851, "num_tokens": 470874001.0, "step": 12342 }, { "epoch": 1.5701564686426663, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.14097213745117, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8551372289657593, "num_tokens": 470916870.0, "step": 12343 }, { "epoch": 1.5702836789212569, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.32502746582031, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8676697015762329, "num_tokens": 470953862.0, "step": 12344 }, { "epoch": 1.5704108891998474, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.48857498168945, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8708890080451965, "num_tokens": 470991507.0, "step": 12345 }, { "epoch": 1.570538099478438, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.891056060791016, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.863484263420105, "num_tokens": 471030109.0, "step": 12346 }, { "epoch": 1.5706653097570284, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.54236602783203, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8743596076965332, "num_tokens": 471063829.0, "step": 12347 }, { "epoch": 1.570792520035619, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.494346618652344, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8510355949401855, "num_tokens": 471107259.0, "step": 12348 }, { "epoch": 1.5709197303142095, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.018890380859375, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.863817572593689, "num_tokens": 471141516.0, "step": 12349 }, { "epoch": 1.5710469405928, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.55582046508789, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8605961203575134, "num_tokens": 471178655.0, "step": 12350 }, { "epoch": 1.5711741508713906, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.248687744140625, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8725277185440063, "num_tokens": 471209437.0, "step": 12351 }, { "epoch": 1.5713013611499809, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.40107727050781, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8769488334655762, "num_tokens": 471249559.0, "step": 12352 }, { "epoch": 1.5714285714285714, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.87355041503906, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8658396601676941, "num_tokens": 471287179.0, "step": 12353 }, { "epoch": 1.571555781707162, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 40.89297103881836, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8636862635612488, "num_tokens": 471328052.0, "step": 12354 }, { "epoch": 1.5716829919857525, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.242374420166016, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.860223650932312, "num_tokens": 471362414.0, "step": 12355 }, { "epoch": 1.571810202264343, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.108455657958984, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.856471061706543, "num_tokens": 471397910.0, "step": 12356 }, { "epoch": 1.5719374125429335, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.952396392822266, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8745087385177612, "num_tokens": 471436951.0, "step": 12357 }, { "epoch": 1.5720646228215238, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.640018463134766, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8636142015457153, "num_tokens": 471470139.0, "step": 12358 }, { "epoch": 1.5721918331001143, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.64461135864258, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8679877519607544, "num_tokens": 471508591.0, "step": 12359 }, { "epoch": 1.5723190433787049, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.491146087646484, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8612231612205505, "num_tokens": 471549165.0, "step": 12360 }, { "epoch": 1.5724462536572954, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.30973434448242, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8541104793548584, "num_tokens": 471589021.0, "step": 12361 }, { "epoch": 1.572573463935886, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.70283508300781, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8534430861473083, "num_tokens": 471624263.0, "step": 12362 }, { "epoch": 1.5727006742144765, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.116424560546875, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8554537296295166, "num_tokens": 471663678.0, "step": 12363 }, { "epoch": 1.572827884493067, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.01194763183594, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8585090637207031, "num_tokens": 471705887.0, "step": 12364 }, { "epoch": 1.5729550947716575, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.327056884765625, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8696675300598145, "num_tokens": 471739796.0, "step": 12365 }, { "epoch": 1.573082305050248, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.32722854614258, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8581712245941162, "num_tokens": 471774723.0, "step": 12366 }, { "epoch": 1.5732095153288386, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.641212463378906, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8541499376296997, "num_tokens": 471814169.0, "step": 12367 }, { "epoch": 1.573336725607429, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.60797882080078, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8627200722694397, "num_tokens": 471852367.0, "step": 12368 }, { "epoch": 1.5734639358860196, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.48872375488281, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.867233395576477, "num_tokens": 471883438.0, "step": 12369 }, { "epoch": 1.5735911461646102, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.16764450073242, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8573161363601685, "num_tokens": 471921051.0, "step": 12370 }, { "epoch": 1.5737183564432007, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.26347351074219, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8648399710655212, "num_tokens": 471959685.0, "step": 12371 }, { "epoch": 1.5738455667217912, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.97309112548828, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8607463836669922, "num_tokens": 471997345.0, "step": 12372 }, { "epoch": 1.5739727770003817, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.50617218017578, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8675835132598877, "num_tokens": 472034455.0, "step": 12373 }, { "epoch": 1.5740999872789723, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.6533317565918, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8596992492675781, "num_tokens": 472073630.0, "step": 12374 }, { "epoch": 1.5742271975575628, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.84537887573242, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8587663173675537, "num_tokens": 472111556.0, "step": 12375 }, { "epoch": 1.5743544078361533, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.89518356323242, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8681004047393799, "num_tokens": 472148276.0, "step": 12376 }, { "epoch": 1.5744816181147436, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.71344757080078, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8605842590332031, "num_tokens": 472181665.0, "step": 12377 }, { "epoch": 1.5746088283933342, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.439491271972656, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8540229797363281, "num_tokens": 472222759.0, "step": 12378 }, { "epoch": 1.5747360386719247, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.541526794433594, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8558218479156494, "num_tokens": 472260867.0, "step": 12379 }, { "epoch": 1.5748632489505152, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.45551300048828, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8485344052314758, "num_tokens": 472300068.0, "step": 12380 }, { "epoch": 1.5749904592291057, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.35930633544922, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8711786866188049, "num_tokens": 472336459.0, "step": 12381 }, { "epoch": 1.5751176695076963, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.5718879699707, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8647964000701904, "num_tokens": 472376696.0, "step": 12382 }, { "epoch": 1.5752448797862866, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.5672607421875, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8728974461555481, "num_tokens": 472414491.0, "step": 12383 }, { "epoch": 1.575372090064877, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.83635711669922, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8608640432357788, "num_tokens": 472452188.0, "step": 12384 }, { "epoch": 1.5754993003434676, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.44661331176758, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.859819769859314, "num_tokens": 472489690.0, "step": 12385 }, { "epoch": 1.5756265106220582, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.827266693115234, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8596673011779785, "num_tokens": 472529912.0, "step": 12386 }, { "epoch": 1.5757537209006487, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.41521072387695, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.860758364200592, "num_tokens": 472562534.0, "step": 12387 }, { "epoch": 1.5758809311792392, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.455711364746094, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8586195707321167, "num_tokens": 472598955.0, "step": 12388 }, { "epoch": 1.5760081414578297, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.591835021972656, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8639513254165649, "num_tokens": 472638064.0, "step": 12389 }, { "epoch": 1.5761353517364203, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.162044525146484, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8648912906646729, "num_tokens": 472676823.0, "step": 12390 }, { "epoch": 1.5762625620150108, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.5685920715332, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8605157732963562, "num_tokens": 472718528.0, "step": 12391 }, { "epoch": 1.5763897722936013, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.23784637451172, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.867924690246582, "num_tokens": 472755634.0, "step": 12392 }, { "epoch": 1.5765169825721919, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.64545822143555, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8722963929176331, "num_tokens": 472798060.0, "step": 12393 }, { "epoch": 1.5766441928507824, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.09061050415039, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8664652109146118, "num_tokens": 472837628.0, "step": 12394 }, { "epoch": 1.576771403129373, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.31602478027344, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8629714846611023, "num_tokens": 472877446.0, "step": 12395 }, { "epoch": 1.5768986134079634, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.521060943603516, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8558546304702759, "num_tokens": 472918437.0, "step": 12396 }, { "epoch": 1.577025823686554, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.06123352050781, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8693336844444275, "num_tokens": 472956242.0, "step": 12397 }, { "epoch": 1.5771530339651445, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.13250732421875, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8716342449188232, "num_tokens": 472985620.0, "step": 12398 }, { "epoch": 1.577280244243735, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 40.8741340637207, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8648526668548584, "num_tokens": 473019517.0, "step": 12399 }, { "epoch": 1.5774074545223256, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.72100830078125, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8809661865234375, "num_tokens": 473054624.0, "step": 12400 }, { "epoch": 1.5775346648009159, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.96169662475586, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8514515161514282, "num_tokens": 473094140.0, "step": 12401 }, { "epoch": 1.5776618750795064, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.51324462890625, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8442765474319458, "num_tokens": 473136886.0, "step": 12402 }, { "epoch": 1.577789085358097, "ewc_loss": 0.1396484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00011873245239257812, "grad_norm": 40.88155746459961, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8493367433547974, "num_tokens": 473173267.0, "step": 12403 }, { "epoch": 1.5779162956366874, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.939781188964844, "learning_rate": 1e-06, "loss": 0.623, "mean_token_accuracy": 0.8393799066543579, "num_tokens": 473216477.0, "step": 12404 }, { "epoch": 1.578043505915278, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012063980102539062, "grad_norm": 41.51895523071289, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8527548313140869, "num_tokens": 473256986.0, "step": 12405 }, { "epoch": 1.5781707161938685, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.785762786865234, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8685148358345032, "num_tokens": 473297967.0, "step": 12406 }, { "epoch": 1.5782979264724588, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.931922912597656, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.853492021560669, "num_tokens": 473333580.0, "step": 12407 }, { "epoch": 1.5784251367510493, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.02174758911133, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8665550947189331, "num_tokens": 473370553.0, "step": 12408 }, { "epoch": 1.5785523470296399, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.32561492919922, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.842296838760376, "num_tokens": 473410705.0, "step": 12409 }, { "epoch": 1.5786795573082304, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.63105392456055, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8648756742477417, "num_tokens": 473443315.0, "step": 12410 }, { "epoch": 1.578806767586821, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.85944747924805, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8533217310905457, "num_tokens": 473488767.0, "step": 12411 }, { "epoch": 1.5789339778654115, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.89140701293945, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.876333475112915, "num_tokens": 473525360.0, "step": 12412 }, { "epoch": 1.579061188144002, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.038997650146484, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.852551281452179, "num_tokens": 473560625.0, "step": 12413 }, { "epoch": 1.5791883984225925, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.87467575073242, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8782860040664673, "num_tokens": 473595484.0, "step": 12414 }, { "epoch": 1.579315608701183, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.405635833740234, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8589832782745361, "num_tokens": 473627269.0, "step": 12415 }, { "epoch": 1.5794428189797736, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.20832061767578, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8459560871124268, "num_tokens": 473669159.0, "step": 12416 }, { "epoch": 1.579570029258364, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.381195068359375, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8713869452476501, "num_tokens": 473707739.0, "step": 12417 }, { "epoch": 1.5796972395369546, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.357913970947266, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.863282322883606, "num_tokens": 473746908.0, "step": 12418 }, { "epoch": 1.5798244498155452, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.53628158569336, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8585084080696106, "num_tokens": 473781480.0, "step": 12419 }, { "epoch": 1.5799516600941357, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.4478645324707, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8617966175079346, "num_tokens": 473814676.0, "step": 12420 }, { "epoch": 1.5800788703727262, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.527103424072266, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8559020161628723, "num_tokens": 473855487.0, "step": 12421 }, { "epoch": 1.5802060806513167, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.47686004638672, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8726097345352173, "num_tokens": 473885648.0, "step": 12422 }, { "epoch": 1.5803332909299073, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.52047348022461, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8574526309967041, "num_tokens": 473923862.0, "step": 12423 }, { "epoch": 1.5804605012084978, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.735755920410156, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8650099039077759, "num_tokens": 473957725.0, "step": 12424 }, { "epoch": 1.5805877114870883, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.932437896728516, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8622827529907227, "num_tokens": 473994939.0, "step": 12425 }, { "epoch": 1.5807149217656786, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 42.09868621826172, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8568423390388489, "num_tokens": 474035996.0, "step": 12426 }, { "epoch": 1.5808421320442692, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 42.19410705566406, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8700088858604431, "num_tokens": 474071693.0, "step": 12427 }, { "epoch": 1.5809693423228597, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.77226638793945, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8683493137359619, "num_tokens": 474114415.0, "step": 12428 }, { "epoch": 1.5810965526014502, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.99275207519531, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8547002077102661, "num_tokens": 474152060.0, "step": 12429 }, { "epoch": 1.5812237628800407, "ewc_loss": 0.1416015625, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001201629638671875, "grad_norm": 41.880645751953125, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8637023568153381, "num_tokens": 474190605.0, "step": 12430 }, { "epoch": 1.5813509731586313, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.76600646972656, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.855531632900238, "num_tokens": 474229447.0, "step": 12431 }, { "epoch": 1.5814781834372216, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.198238372802734, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8688650727272034, "num_tokens": 474268088.0, "step": 12432 }, { "epoch": 1.581605393715812, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.30709457397461, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8562537431716919, "num_tokens": 474304856.0, "step": 12433 }, { "epoch": 1.5817326039944026, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.60375213623047, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8652119040489197, "num_tokens": 474349248.0, "step": 12434 }, { "epoch": 1.5818598142729932, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.51283645629883, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8716548085212708, "num_tokens": 474384432.0, "step": 12435 }, { "epoch": 1.5819870245515837, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.1014404296875, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8582234382629395, "num_tokens": 474415881.0, "step": 12436 }, { "epoch": 1.5821142348301742, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.37665557861328, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.859731137752533, "num_tokens": 474452755.0, "step": 12437 }, { "epoch": 1.5822414451087647, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.759849548339844, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8550809621810913, "num_tokens": 474493937.0, "step": 12438 }, { "epoch": 1.5823686553873553, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.062652587890625, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.867700457572937, "num_tokens": 474534072.0, "step": 12439 }, { "epoch": 1.5824958656659458, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.9013557434082, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8534039855003357, "num_tokens": 474569343.0, "step": 12440 }, { "epoch": 1.5826230759445363, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012159347534179688, "grad_norm": 41.921627044677734, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8506821393966675, "num_tokens": 474606610.0, "step": 12441 }, { "epoch": 1.5827502862231269, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.123287200927734, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8517175912857056, "num_tokens": 474645071.0, "step": 12442 }, { "epoch": 1.5828774965017174, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.741607666015625, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8487957715988159, "num_tokens": 474683985.0, "step": 12443 }, { "epoch": 1.583004706780308, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.01022720336914, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8676767349243164, "num_tokens": 474726102.0, "step": 12444 }, { "epoch": 1.5831319170588984, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.273868560791016, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8642024397850037, "num_tokens": 474766430.0, "step": 12445 }, { "epoch": 1.583259127337489, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.57149124145508, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8586989641189575, "num_tokens": 474800648.0, "step": 12446 }, { "epoch": 1.5833863376160795, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.22597885131836, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8648801445960999, "num_tokens": 474845540.0, "step": 12447 }, { "epoch": 1.58351354789467, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.3739013671875, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8585692644119263, "num_tokens": 474884296.0, "step": 12448 }, { "epoch": 1.5836407581732606, "ewc_loss": 0.142578125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012111663818359375, "grad_norm": 41.16261291503906, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8654544949531555, "num_tokens": 474929344.0, "step": 12449 }, { "epoch": 1.5837679684518509, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.295982360839844, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.865086555480957, "num_tokens": 474968035.0, "step": 12450 }, { "epoch": 1.5838951787304414, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.3651237487793, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8547270894050598, "num_tokens": 475006189.0, "step": 12451 }, { "epoch": 1.584022389009032, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.16866683959961, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.856227695941925, "num_tokens": 475039768.0, "step": 12452 }, { "epoch": 1.5841495992876224, "ewc_loss": 0.1435546875, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001220703125, "grad_norm": 41.71453094482422, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8667817115783691, "num_tokens": 475079938.0, "step": 12453 }, { "epoch": 1.584276809566213, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.48472595214844, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8488888740539551, "num_tokens": 475122405.0, "step": 12454 }, { "epoch": 1.5844040198448035, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.089420318603516, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8537287712097168, "num_tokens": 475158237.0, "step": 12455 }, { "epoch": 1.5845312301233938, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.81254196166992, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.86257004737854, "num_tokens": 475194966.0, "step": 12456 }, { "epoch": 1.5846584404019843, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.03056335449219, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8592312335968018, "num_tokens": 475234483.0, "step": 12457 }, { "epoch": 1.5847856506805749, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.96432876586914, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8551024198532104, "num_tokens": 475275172.0, "step": 12458 }, { "epoch": 1.5849128609591654, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.17864227294922, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.865881621837616, "num_tokens": 475313550.0, "step": 12459 }, { "epoch": 1.585040071237756, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.768306732177734, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.868717610836029, "num_tokens": 475357019.0, "step": 12460 }, { "epoch": 1.5851672815163464, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.93174362182617, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8653153777122498, "num_tokens": 475394788.0, "step": 12461 }, { "epoch": 1.585294491794937, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.97341537475586, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8583322763442993, "num_tokens": 475432354.0, "step": 12462 }, { "epoch": 1.5854217020735275, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.781551361083984, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8642557859420776, "num_tokens": 475467284.0, "step": 12463 }, { "epoch": 1.585548912352118, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.493553161621094, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8532529473304749, "num_tokens": 475505946.0, "step": 12464 }, { "epoch": 1.5856761226307086, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.022525787353516, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8479677438735962, "num_tokens": 475542799.0, "step": 12465 }, { "epoch": 1.585803332909299, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.26312255859375, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.860795259475708, "num_tokens": 475582064.0, "step": 12466 }, { "epoch": 1.5859305431878896, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.195350646972656, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8563574552536011, "num_tokens": 475615521.0, "step": 12467 }, { "epoch": 1.5860577534664801, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.34144592285156, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8682781457901001, "num_tokens": 475650500.0, "step": 12468 }, { "epoch": 1.5861849637450707, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.709197998046875, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8622480630874634, "num_tokens": 475695824.0, "step": 12469 }, { "epoch": 1.5863121740236612, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 40.904903411865234, "learning_rate": 1e-06, "loss": 0.6377, "mean_token_accuracy": 0.8384559750556946, "num_tokens": 475734015.0, "step": 12470 }, { "epoch": 1.5864393843022517, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.0402717590332, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8592066764831543, "num_tokens": 475768961.0, "step": 12471 }, { "epoch": 1.5865665945808423, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.054439544677734, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8755800127983093, "num_tokens": 475806551.0, "step": 12472 }, { "epoch": 1.5866938048594328, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.25323486328125, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8548975586891174, "num_tokens": 475844066.0, "step": 12473 }, { "epoch": 1.5868210151380233, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.514259338378906, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8640743494033813, "num_tokens": 475874554.0, "step": 12474 }, { "epoch": 1.5869482254166136, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.987091064453125, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8667945265769958, "num_tokens": 475915840.0, "step": 12475 }, { "epoch": 1.5870754356952042, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.66572570800781, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8595978021621704, "num_tokens": 475951516.0, "step": 12476 }, { "epoch": 1.5872026459737947, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.933475494384766, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8629779815673828, "num_tokens": 475984224.0, "step": 12477 }, { "epoch": 1.5873298562523852, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.797908782958984, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8379660844802856, "num_tokens": 476015758.0, "step": 12478 }, { "epoch": 1.5874570665309757, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.73978042602539, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8563747406005859, "num_tokens": 476051835.0, "step": 12479 }, { "epoch": 1.5875842768095663, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.28275680541992, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8574721217155457, "num_tokens": 476090667.0, "step": 12480 }, { "epoch": 1.5877114870881566, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.09400939941406, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.85933518409729, "num_tokens": 476130448.0, "step": 12481 }, { "epoch": 1.587838697366747, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.322975158691406, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8517101407051086, "num_tokens": 476174495.0, "step": 12482 }, { "epoch": 1.5879659076453376, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.58383560180664, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8682342171669006, "num_tokens": 476213412.0, "step": 12483 }, { "epoch": 1.5880931179239282, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.8454704284668, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8744735717773438, "num_tokens": 476250441.0, "step": 12484 }, { "epoch": 1.5882203282025187, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.545921325683594, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8737304210662842, "num_tokens": 476284466.0, "step": 12485 }, { "epoch": 1.5883475384811092, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.985443115234375, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8583770990371704, "num_tokens": 476327760.0, "step": 12486 }, { "epoch": 1.5884747487596997, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.82402038574219, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8556014895439148, "num_tokens": 476369259.0, "step": 12487 }, { "epoch": 1.5886019590382903, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.4612922668457, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8626182675361633, "num_tokens": 476403915.0, "step": 12488 }, { "epoch": 1.5887291693168808, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.889488220214844, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8590394258499146, "num_tokens": 476440837.0, "step": 12489 }, { "epoch": 1.5888563795954713, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.60133361816406, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8589905500411987, "num_tokens": 476478478.0, "step": 12490 }, { "epoch": 1.5889835898740619, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7975959777832, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8682241439819336, "num_tokens": 476514675.0, "step": 12491 }, { "epoch": 1.5891108001526524, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.099239349365234, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8568626642227173, "num_tokens": 476559015.0, "step": 12492 }, { "epoch": 1.589238010431243, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.39680480957031, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8448246717453003, "num_tokens": 476601848.0, "step": 12493 }, { "epoch": 1.5893652207098334, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.419586181640625, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.868491530418396, "num_tokens": 476636794.0, "step": 12494 }, { "epoch": 1.589492430988424, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.423343658447266, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8548177480697632, "num_tokens": 476671593.0, "step": 12495 }, { "epoch": 1.5896196412670145, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.97072982788086, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8705755472183228, "num_tokens": 476706776.0, "step": 12496 }, { "epoch": 1.589746851545605, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.303504943847656, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8594090342521667, "num_tokens": 476746700.0, "step": 12497 }, { "epoch": 1.5898740618241956, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.296234130859375, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8662057518959045, "num_tokens": 476785837.0, "step": 12498 }, { "epoch": 1.5900012721027859, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.604103088378906, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8673602938652039, "num_tokens": 476818894.0, "step": 12499 }, { "epoch": 1.5901284823813764, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.632171630859375, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8424433469772339, "num_tokens": 476861773.0, "step": 12500 }, { "epoch": 1.590255692659967, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.19408416748047, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8656885623931885, "num_tokens": 476899522.0, "step": 12501 }, { "epoch": 1.5903829029385574, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.74424362182617, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8527327179908752, "num_tokens": 476939923.0, "step": 12502 }, { "epoch": 1.590510113217148, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.87417221069336, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8607823848724365, "num_tokens": 476979867.0, "step": 12503 }, { "epoch": 1.5906373234957385, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.04124450683594, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8682454824447632, "num_tokens": 477016068.0, "step": 12504 }, { "epoch": 1.5907645337743288, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.149776458740234, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8565946817398071, "num_tokens": 477054526.0, "step": 12505 }, { "epoch": 1.5908917440529193, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.95996856689453, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8555128574371338, "num_tokens": 477098029.0, "step": 12506 }, { "epoch": 1.5910189543315099, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.05930709838867, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8665021657943726, "num_tokens": 477137070.0, "step": 12507 }, { "epoch": 1.5911461646101004, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.981937408447266, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8600378036499023, "num_tokens": 477175640.0, "step": 12508 }, { "epoch": 1.591273374888691, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.69917297363281, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8623713254928589, "num_tokens": 477210935.0, "step": 12509 }, { "epoch": 1.5914005851672814, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.842464447021484, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8676918148994446, "num_tokens": 477251567.0, "step": 12510 }, { "epoch": 1.591527795445872, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.30535888671875, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8623763918876648, "num_tokens": 477287964.0, "step": 12511 }, { "epoch": 1.5916550057244625, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.05910873413086, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8364202976226807, "num_tokens": 477328660.0, "step": 12512 }, { "epoch": 1.591782216003053, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.57315444946289, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8655481934547424, "num_tokens": 477364111.0, "step": 12513 }, { "epoch": 1.5919094262816436, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.48957443237305, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8674106597900391, "num_tokens": 477402691.0, "step": 12514 }, { "epoch": 1.592036636560234, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.827816009521484, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8734984993934631, "num_tokens": 477437526.0, "step": 12515 }, { "epoch": 1.5921638468388246, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.237144470214844, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8593021631240845, "num_tokens": 477479175.0, "step": 12516 }, { "epoch": 1.5922910571174151, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.6756591796875, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8625613451004028, "num_tokens": 477516896.0, "step": 12517 }, { "epoch": 1.5924182673960057, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.11037063598633, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.850622832775116, "num_tokens": 477555543.0, "step": 12518 }, { "epoch": 1.5925454776745962, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.55390548706055, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8717426061630249, "num_tokens": 477592251.0, "step": 12519 }, { "epoch": 1.5926726879531867, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.19237518310547, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8474407196044922, "num_tokens": 477630039.0, "step": 12520 }, { "epoch": 1.5927998982317773, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.364837646484375, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8594970703125, "num_tokens": 477661028.0, "step": 12521 }, { "epoch": 1.5929271085103678, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.99677658081055, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8736447095870972, "num_tokens": 477699612.0, "step": 12522 }, { "epoch": 1.5930543187889583, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.46991729736328, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8559211492538452, "num_tokens": 477737300.0, "step": 12523 }, { "epoch": 1.5931815290675486, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.21780776977539, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8495694398880005, "num_tokens": 477776031.0, "step": 12524 }, { "epoch": 1.5933087393461391, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.29355239868164, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8608336448669434, "num_tokens": 477813640.0, "step": 12525 }, { "epoch": 1.5934359496247297, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.96067428588867, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8572101593017578, "num_tokens": 477848652.0, "step": 12526 }, { "epoch": 1.5935631599033202, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.54774856567383, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8568909168243408, "num_tokens": 477883710.0, "step": 12527 }, { "epoch": 1.5936903701819107, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.52696990966797, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8537420630455017, "num_tokens": 477920316.0, "step": 12528 }, { "epoch": 1.5938175804605013, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.314369201660156, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8684550523757935, "num_tokens": 477955523.0, "step": 12529 }, { "epoch": 1.5939447907390916, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.30521774291992, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8528435230255127, "num_tokens": 477995768.0, "step": 12530 }, { "epoch": 1.594072001017682, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.06446075439453, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8604866862297058, "num_tokens": 478037718.0, "step": 12531 }, { "epoch": 1.5941992112962726, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.2773551940918, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8485103845596313, "num_tokens": 478077388.0, "step": 12532 }, { "epoch": 1.5943264215748632, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.266239166259766, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8751593232154846, "num_tokens": 478109494.0, "step": 12533 }, { "epoch": 1.5944536318534537, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.53928756713867, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8555089235305786, "num_tokens": 478142420.0, "step": 12534 }, { "epoch": 1.5945808421320442, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.22418975830078, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8628935813903809, "num_tokens": 478183848.0, "step": 12535 }, { "epoch": 1.5947080524106347, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.13013458251953, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8685235381126404, "num_tokens": 478225242.0, "step": 12536 }, { "epoch": 1.5948352626892253, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.047630310058594, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8673941493034363, "num_tokens": 478263167.0, "step": 12537 }, { "epoch": 1.5949624729678158, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.59599304199219, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8620392680168152, "num_tokens": 478297752.0, "step": 12538 }, { "epoch": 1.5950896832464063, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.174713134765625, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8632021546363831, "num_tokens": 478332955.0, "step": 12539 }, { "epoch": 1.5952168935249968, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.46733093261719, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8406746983528137, "num_tokens": 478367298.0, "step": 12540 }, { "epoch": 1.5953441038035874, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.27091598510742, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8649081587791443, "num_tokens": 478411183.0, "step": 12541 }, { "epoch": 1.595471314082178, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.79319763183594, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8571886420249939, "num_tokens": 478451184.0, "step": 12542 }, { "epoch": 1.5955985243607684, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.89650344848633, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8557175397872925, "num_tokens": 478490913.0, "step": 12543 }, { "epoch": 1.595725734639359, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.992584228515625, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8620621562004089, "num_tokens": 478529769.0, "step": 12544 }, { "epoch": 1.5958529449179495, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.823097229003906, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8563151955604553, "num_tokens": 478566665.0, "step": 12545 }, { "epoch": 1.59598015519654, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.770362854003906, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8884406089782715, "num_tokens": 478600814.0, "step": 12546 }, { "epoch": 1.5961073654751305, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.127777099609375, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.864556074142456, "num_tokens": 478641610.0, "step": 12547 }, { "epoch": 1.5962345757537209, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.707035064697266, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.867053210735321, "num_tokens": 478678183.0, "step": 12548 }, { "epoch": 1.5963617860323114, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.54751968383789, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8633335828781128, "num_tokens": 478710881.0, "step": 12549 }, { "epoch": 1.596488996310902, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.6240119934082, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8750956058502197, "num_tokens": 478748550.0, "step": 12550 }, { "epoch": 1.5966162065894924, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.19375991821289, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.850008487701416, "num_tokens": 478790417.0, "step": 12551 }, { "epoch": 1.596743416868083, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.85701370239258, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8677351474761963, "num_tokens": 478828856.0, "step": 12552 }, { "epoch": 1.5968706271466735, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.64616394042969, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8578109741210938, "num_tokens": 478870240.0, "step": 12553 }, { "epoch": 1.5969978374252638, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.87018585205078, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8508777618408203, "num_tokens": 478911920.0, "step": 12554 }, { "epoch": 1.5971250477038543, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.73701477050781, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.854218065738678, "num_tokens": 478949083.0, "step": 12555 }, { "epoch": 1.5972522579824449, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.994041442871094, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.859201192855835, "num_tokens": 478984087.0, "step": 12556 }, { "epoch": 1.5973794682610354, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.92626190185547, "learning_rate": 1e-06, "loss": 0.6236, "mean_token_accuracy": 0.8411361575126648, "num_tokens": 479019364.0, "step": 12557 }, { "epoch": 1.597506678539626, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.92429733276367, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8636146783828735, "num_tokens": 479059002.0, "step": 12558 }, { "epoch": 1.5976338888182164, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.26411819458008, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8605220317840576, "num_tokens": 479092676.0, "step": 12559 }, { "epoch": 1.597761099096807, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.937557220458984, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8632960319519043, "num_tokens": 479125951.0, "step": 12560 }, { "epoch": 1.5978883093753975, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.542354583740234, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8782341480255127, "num_tokens": 479166230.0, "step": 12561 }, { "epoch": 1.598015519653988, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.8967170715332, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8568360209465027, "num_tokens": 479205813.0, "step": 12562 }, { "epoch": 1.5981427299325786, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.594303131103516, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8563037514686584, "num_tokens": 479244960.0, "step": 12563 }, { "epoch": 1.598269940211169, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.912986755371094, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8677971959114075, "num_tokens": 479289617.0, "step": 12564 }, { "epoch": 1.5983971504897596, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.68368148803711, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8610439896583557, "num_tokens": 479326377.0, "step": 12565 }, { "epoch": 1.5985243607683501, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.06370162963867, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8601387739181519, "num_tokens": 479360563.0, "step": 12566 }, { "epoch": 1.5986515710469407, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.76237487792969, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8718822598457336, "num_tokens": 479403042.0, "step": 12567 }, { "epoch": 1.5987787813255312, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.28815841674805, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8672224283218384, "num_tokens": 479437099.0, "step": 12568 }, { "epoch": 1.5989059916041217, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.54554748535156, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8588892221450806, "num_tokens": 479479326.0, "step": 12569 }, { "epoch": 1.5990332018827123, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.982784271240234, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.841444194316864, "num_tokens": 479515320.0, "step": 12570 }, { "epoch": 1.5991604121613028, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.04774475097656, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8499681353569031, "num_tokens": 479559960.0, "step": 12571 }, { "epoch": 1.5992876224398933, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.071109771728516, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.860659122467041, "num_tokens": 479599115.0, "step": 12572 }, { "epoch": 1.5994148327184836, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.63621139526367, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8696594834327698, "num_tokens": 479636116.0, "step": 12573 }, { "epoch": 1.5995420429970741, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.9332275390625, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.85206139087677, "num_tokens": 479676880.0, "step": 12574 }, { "epoch": 1.5996692532756647, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.16611099243164, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8624349236488342, "num_tokens": 479717424.0, "step": 12575 }, { "epoch": 1.5997964635542552, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.10630798339844, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8600059747695923, "num_tokens": 479756240.0, "step": 12576 }, { "epoch": 1.5999236738328457, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.02098083496094, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.847422182559967, "num_tokens": 479791863.0, "step": 12577 }, { "epoch": 1.6000508841114363, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.281028747558594, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8708394765853882, "num_tokens": 479827252.0, "step": 12578 }, { "epoch": 1.6001780943900266, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.97041702270508, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8571506142616272, "num_tokens": 479863100.0, "step": 12579 }, { "epoch": 1.600305304668617, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07978439331055, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8500118851661682, "num_tokens": 479899684.0, "step": 12580 }, { "epoch": 1.6004325149472076, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.66037368774414, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8614538311958313, "num_tokens": 479937656.0, "step": 12581 }, { "epoch": 1.6005597252257981, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.67652893066406, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8547723293304443, "num_tokens": 479973357.0, "step": 12582 }, { "epoch": 1.6006869355043887, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.15724182128906, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8615018129348755, "num_tokens": 480017862.0, "step": 12583 }, { "epoch": 1.6008141457829792, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.46462631225586, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8669230937957764, "num_tokens": 480057039.0, "step": 12584 }, { "epoch": 1.6009413560615697, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.62105178833008, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8642955422401428, "num_tokens": 480093396.0, "step": 12585 }, { "epoch": 1.6010685663401603, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.249244689941406, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8581703305244446, "num_tokens": 480125200.0, "step": 12586 }, { "epoch": 1.6011957766187508, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.53767013549805, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8570730686187744, "num_tokens": 480158085.0, "step": 12587 }, { "epoch": 1.6013229868973413, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.76066589355469, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8758873343467712, "num_tokens": 480192979.0, "step": 12588 }, { "epoch": 1.6014501971759318, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.10853576660156, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8713278770446777, "num_tokens": 480227984.0, "step": 12589 }, { "epoch": 1.6015774074545224, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.71038818359375, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8784778118133545, "num_tokens": 480264280.0, "step": 12590 }, { "epoch": 1.601704617733113, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.18327331542969, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.845625102519989, "num_tokens": 480300670.0, "step": 12591 }, { "epoch": 1.6018318280117034, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.96122741699219, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8598509430885315, "num_tokens": 480334460.0, "step": 12592 }, { "epoch": 1.601959038290294, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.84923553466797, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8712872266769409, "num_tokens": 480376066.0, "step": 12593 }, { "epoch": 1.6020862485688845, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.772037506103516, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8667850494384766, "num_tokens": 480417575.0, "step": 12594 }, { "epoch": 1.602213458847475, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.82435607910156, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8505271673202515, "num_tokens": 480454106.0, "step": 12595 }, { "epoch": 1.6023406691260655, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.62106704711914, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8471622467041016, "num_tokens": 480493031.0, "step": 12596 }, { "epoch": 1.6024678794046558, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.1627311706543, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.861714243888855, "num_tokens": 480528326.0, "step": 12597 }, { "epoch": 1.6025950896832464, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.630210876464844, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.867743968963623, "num_tokens": 480565399.0, "step": 12598 }, { "epoch": 1.602722299961837, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.58391571044922, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8763432502746582, "num_tokens": 480600028.0, "step": 12599 }, { "epoch": 1.6028495102404274, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.44150924682617, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8605074882507324, "num_tokens": 480635759.0, "step": 12600 }, { "epoch": 1.602976720519018, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.46027374267578, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8672289848327637, "num_tokens": 480669528.0, "step": 12601 }, { "epoch": 1.6031039307976085, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.29868698120117, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8587489128112793, "num_tokens": 480709425.0, "step": 12602 }, { "epoch": 1.6032311410761988, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.222251892089844, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8451756238937378, "num_tokens": 480745010.0, "step": 12603 }, { "epoch": 1.6033583513547893, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.87189865112305, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8682574033737183, "num_tokens": 480780320.0, "step": 12604 }, { "epoch": 1.6034855616333799, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.749080657958984, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8690513968467712, "num_tokens": 480819335.0, "step": 12605 }, { "epoch": 1.6036127719119704, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.915924072265625, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8738722801208496, "num_tokens": 480852702.0, "step": 12606 }, { "epoch": 1.603739982190561, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.43994903564453, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8551206588745117, "num_tokens": 480890258.0, "step": 12607 }, { "epoch": 1.6038671924691514, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.78981399536133, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8606021404266357, "num_tokens": 480929555.0, "step": 12608 }, { "epoch": 1.603994402747742, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.33658981323242, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8840211629867554, "num_tokens": 480970295.0, "step": 12609 }, { "epoch": 1.6041216130263325, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.12664794921875, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8634204268455505, "num_tokens": 481011213.0, "step": 12610 }, { "epoch": 1.604248823304923, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.09038543701172, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.855682373046875, "num_tokens": 481045273.0, "step": 12611 }, { "epoch": 1.6043760335835135, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.057743072509766, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8400804996490479, "num_tokens": 481084567.0, "step": 12612 }, { "epoch": 1.604503243862104, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.22178268432617, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8620529174804688, "num_tokens": 481119455.0, "step": 12613 }, { "epoch": 1.6046304541406946, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.24824523925781, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8459175825119019, "num_tokens": 481154718.0, "step": 12614 }, { "epoch": 1.6047576644192851, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.744503021240234, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8554936647415161, "num_tokens": 481193716.0, "step": 12615 }, { "epoch": 1.6048848746978757, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.2816276550293, "learning_rate": 1e-06, "loss": 0.6349, "mean_token_accuracy": 0.8376729488372803, "num_tokens": 481231378.0, "step": 12616 }, { "epoch": 1.6050120849764662, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.69359588623047, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8585805892944336, "num_tokens": 481267907.0, "step": 12617 }, { "epoch": 1.6051392952550567, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.3216438293457, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8616770505905151, "num_tokens": 481309899.0, "step": 12618 }, { "epoch": 1.6052665055336472, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.973934173583984, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8708909749984741, "num_tokens": 481348845.0, "step": 12619 }, { "epoch": 1.6053937158122378, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.32833480834961, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8525460362434387, "num_tokens": 481384999.0, "step": 12620 }, { "epoch": 1.6055209260908283, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.80330276489258, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8618859648704529, "num_tokens": 481417287.0, "step": 12621 }, { "epoch": 1.6056481363694186, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.35512924194336, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.865425705909729, "num_tokens": 481455943.0, "step": 12622 }, { "epoch": 1.6057753466480091, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.96675491333008, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8765078783035278, "num_tokens": 481492017.0, "step": 12623 }, { "epoch": 1.6059025569265997, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.02409744262695, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8526329398155212, "num_tokens": 481535115.0, "step": 12624 }, { "epoch": 1.6060297672051902, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.63036346435547, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8457221984863281, "num_tokens": 481580048.0, "step": 12625 }, { "epoch": 1.6061569774837807, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.79351806640625, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8704390525817871, "num_tokens": 481616748.0, "step": 12626 }, { "epoch": 1.6062841877623713, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.61977005004883, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8330027461051941, "num_tokens": 481659123.0, "step": 12627 }, { "epoch": 1.6064113980409616, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.68156051635742, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.850918710231781, "num_tokens": 481704791.0, "step": 12628 }, { "epoch": 1.606538608319552, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.64419937133789, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8468044400215149, "num_tokens": 481742913.0, "step": 12629 }, { "epoch": 1.6066658185981426, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.606361389160156, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8473892211914062, "num_tokens": 481783549.0, "step": 12630 }, { "epoch": 1.6067930288767331, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.209739685058594, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8667865991592407, "num_tokens": 481818486.0, "step": 12631 }, { "epoch": 1.6069202391553237, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.920196533203125, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8621059656143188, "num_tokens": 481849969.0, "step": 12632 }, { "epoch": 1.6070474494339142, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.02388000488281, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8448072671890259, "num_tokens": 481886108.0, "step": 12633 }, { "epoch": 1.6071746597125047, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.20891571044922, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8721051216125488, "num_tokens": 481919389.0, "step": 12634 }, { "epoch": 1.6073018699910953, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.54698181152344, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8740209341049194, "num_tokens": 481958531.0, "step": 12635 }, { "epoch": 1.6074290802696858, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.44036102294922, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8559889793395996, "num_tokens": 481999864.0, "step": 12636 }, { "epoch": 1.6075562905482763, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.610809326171875, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8829489946365356, "num_tokens": 482033890.0, "step": 12637 }, { "epoch": 1.6076835008268668, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.450592041015625, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8661867380142212, "num_tokens": 482071186.0, "step": 12638 }, { "epoch": 1.6078107111054574, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.69047546386719, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8597079515457153, "num_tokens": 482107218.0, "step": 12639 }, { "epoch": 1.607937921384048, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31642150878906, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8563704490661621, "num_tokens": 482146101.0, "step": 12640 }, { "epoch": 1.6080651316626384, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.774505615234375, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.875108540058136, "num_tokens": 482178531.0, "step": 12641 }, { "epoch": 1.608192341941229, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.954917907714844, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8691089153289795, "num_tokens": 482217019.0, "step": 12642 }, { "epoch": 1.6083195522198195, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.641727447509766, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8630063533782959, "num_tokens": 482250547.0, "step": 12643 }, { "epoch": 1.60844676249841, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.94296646118164, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8572930097579956, "num_tokens": 482287009.0, "step": 12644 }, { "epoch": 1.6085739727770005, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.83633041381836, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8483232855796814, "num_tokens": 482327373.0, "step": 12645 }, { "epoch": 1.6087011830555908, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.1136474609375, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8506875038146973, "num_tokens": 482363409.0, "step": 12646 }, { "epoch": 1.6088283933341814, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.762142181396484, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8497554063796997, "num_tokens": 482396499.0, "step": 12647 }, { "epoch": 1.608955603612772, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.99441909790039, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8630333542823792, "num_tokens": 482435059.0, "step": 12648 }, { "epoch": 1.6090828138913624, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.67169189453125, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8521313071250916, "num_tokens": 482472793.0, "step": 12649 }, { "epoch": 1.609210024169953, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31254577636719, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8619116544723511, "num_tokens": 482508857.0, "step": 12650 }, { "epoch": 1.6093372344485435, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07051467895508, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.845463752746582, "num_tokens": 482548526.0, "step": 12651 }, { "epoch": 1.6094644447271338, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.2635498046875, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8521634340286255, "num_tokens": 482586461.0, "step": 12652 }, { "epoch": 1.6095916550057243, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.937828063964844, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8528232574462891, "num_tokens": 482624642.0, "step": 12653 }, { "epoch": 1.6097188652843148, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.98746109008789, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8580089807510376, "num_tokens": 482664096.0, "step": 12654 }, { "epoch": 1.6098460755629054, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.80366897583008, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8720681667327881, "num_tokens": 482701230.0, "step": 12655 }, { "epoch": 1.609973285841496, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.65583801269531, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8605749607086182, "num_tokens": 482741611.0, "step": 12656 }, { "epoch": 1.6101004961200864, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.67216491699219, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.871082067489624, "num_tokens": 482779482.0, "step": 12657 }, { "epoch": 1.610227706398677, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.573768615722656, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8691003322601318, "num_tokens": 482816279.0, "step": 12658 }, { "epoch": 1.6103549166772675, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.42170333862305, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8508950471878052, "num_tokens": 482859941.0, "step": 12659 }, { "epoch": 1.610482126955858, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.14813232421875, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8649080991744995, "num_tokens": 482894348.0, "step": 12660 }, { "epoch": 1.6106093372344485, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.34333038330078, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8553183674812317, "num_tokens": 482930882.0, "step": 12661 }, { "epoch": 1.610736547513039, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.20237731933594, "learning_rate": 1e-06, "loss": 0.6178, "mean_token_accuracy": 0.8427517414093018, "num_tokens": 482969515.0, "step": 12662 }, { "epoch": 1.6108637577916296, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.84749984741211, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8568333387374878, "num_tokens": 483008074.0, "step": 12663 }, { "epoch": 1.6109909680702201, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07229995727539, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8523508906364441, "num_tokens": 483038920.0, "step": 12664 }, { "epoch": 1.6111181783488107, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.79109191894531, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8707712888717651, "num_tokens": 483075559.0, "step": 12665 }, { "epoch": 1.6112453886274012, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.850067138671875, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8644919395446777, "num_tokens": 483114378.0, "step": 12666 }, { "epoch": 1.6113725989059917, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.18669891357422, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8651837110519409, "num_tokens": 483147911.0, "step": 12667 }, { "epoch": 1.6114998091845822, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.977657318115234, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8741943836212158, "num_tokens": 483188275.0, "step": 12668 }, { "epoch": 1.6116270194631728, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.2258415222168, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8726557493209839, "num_tokens": 483230804.0, "step": 12669 }, { "epoch": 1.6117542297417633, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.34379959106445, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8633183836936951, "num_tokens": 483262368.0, "step": 12670 }, { "epoch": 1.6118814400203536, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.049564361572266, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8490536212921143, "num_tokens": 483300894.0, "step": 12671 }, { "epoch": 1.6120086502989441, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.82749938964844, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8678749203681946, "num_tokens": 483337377.0, "step": 12672 }, { "epoch": 1.6121358605775347, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.15549087524414, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8599944114685059, "num_tokens": 483376572.0, "step": 12673 }, { "epoch": 1.6122630708561252, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.3575553894043, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8491274118423462, "num_tokens": 483415584.0, "step": 12674 }, { "epoch": 1.6123902811347157, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.45813751220703, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8621333241462708, "num_tokens": 483459584.0, "step": 12675 }, { "epoch": 1.6125174914133062, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.28105926513672, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8543519973754883, "num_tokens": 483504260.0, "step": 12676 }, { "epoch": 1.6126447016918966, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.43190383911133, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8630349636077881, "num_tokens": 483547271.0, "step": 12677 }, { "epoch": 1.612771911970487, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.98530197143555, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8654624819755554, "num_tokens": 483582255.0, "step": 12678 }, { "epoch": 1.6128991222490776, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.41060256958008, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8548229932785034, "num_tokens": 483622290.0, "step": 12679 }, { "epoch": 1.6130263325276681, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.35626220703125, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8732361793518066, "num_tokens": 483657253.0, "step": 12680 }, { "epoch": 1.6131535428062587, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.54826736450195, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8657447099685669, "num_tokens": 483693062.0, "step": 12681 }, { "epoch": 1.6132807530848492, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.76582717895508, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.854406476020813, "num_tokens": 483735845.0, "step": 12682 }, { "epoch": 1.6134079633634397, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.00984191894531, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8640536069869995, "num_tokens": 483783833.0, "step": 12683 }, { "epoch": 1.6135351736420303, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.1133918762207, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8608900904655457, "num_tokens": 483820175.0, "step": 12684 }, { "epoch": 1.6136623839206208, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.029415130615234, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8715097308158875, "num_tokens": 483863454.0, "step": 12685 }, { "epoch": 1.6137895941992113, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.872283935546875, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8816211819648743, "num_tokens": 483900657.0, "step": 12686 }, { "epoch": 1.6139168044778018, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.313941955566406, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8556200861930847, "num_tokens": 483935912.0, "step": 12687 }, { "epoch": 1.6140440147563924, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.64448547363281, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8620150089263916, "num_tokens": 483968594.0, "step": 12688 }, { "epoch": 1.614171225034983, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.5481071472168, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.870823860168457, "num_tokens": 484010866.0, "step": 12689 }, { "epoch": 1.6142984353135734, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.68861770629883, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8565673828125, "num_tokens": 484051355.0, "step": 12690 }, { "epoch": 1.614425645592164, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.05892562866211, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8697873950004578, "num_tokens": 484087049.0, "step": 12691 }, { "epoch": 1.6145528558707545, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.76249313354492, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8508003950119019, "num_tokens": 484124914.0, "step": 12692 }, { "epoch": 1.614680066149345, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.06679916381836, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8761677742004395, "num_tokens": 484154524.0, "step": 12693 }, { "epoch": 1.6148072764279355, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.27873611450195, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8605983257293701, "num_tokens": 484195091.0, "step": 12694 }, { "epoch": 1.6149344867065258, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.46287155151367, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8636645078659058, "num_tokens": 484236787.0, "step": 12695 }, { "epoch": 1.6150616969851164, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31273651123047, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8557091355323792, "num_tokens": 484281748.0, "step": 12696 }, { "epoch": 1.615188907263707, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.85363006591797, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8682736158370972, "num_tokens": 484321181.0, "step": 12697 }, { "epoch": 1.6153161175422974, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.87049865722656, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8590488433837891, "num_tokens": 484360476.0, "step": 12698 }, { "epoch": 1.615443327820888, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.75521469116211, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8614634275436401, "num_tokens": 484395065.0, "step": 12699 }, { "epoch": 1.6155705380994785, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 41.766357421875, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8632800579071045, "num_tokens": 484439309.0, "step": 12700 }, { "epoch": 1.6156977483780688, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.115570068359375, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8618772029876709, "num_tokens": 484473018.0, "step": 12701 }, { "epoch": 1.6158249586566593, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.78779983520508, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.868386447429657, "num_tokens": 484518306.0, "step": 12702 }, { "epoch": 1.6159521689352498, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.490394592285156, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8477570414543152, "num_tokens": 484556348.0, "step": 12703 }, { "epoch": 1.6160793792138404, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.97411346435547, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8720102906227112, "num_tokens": 484593136.0, "step": 12704 }, { "epoch": 1.616206589492431, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.36347198486328, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8713093996047974, "num_tokens": 484628025.0, "step": 12705 }, { "epoch": 1.6163337997710214, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.13751220703125, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8674967288970947, "num_tokens": 484666703.0, "step": 12706 }, { "epoch": 1.616461010049612, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.04966735839844, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8703083992004395, "num_tokens": 484703417.0, "step": 12707 }, { "epoch": 1.6165882203282025, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.85731506347656, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8757200241088867, "num_tokens": 484741830.0, "step": 12708 }, { "epoch": 1.616715430606793, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.306766510009766, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8615207672119141, "num_tokens": 484780140.0, "step": 12709 }, { "epoch": 1.6168426408853835, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.0124626159668, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8597565293312073, "num_tokens": 484813856.0, "step": 12710 }, { "epoch": 1.616969851163974, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.939613342285156, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8722068071365356, "num_tokens": 484851383.0, "step": 12711 }, { "epoch": 1.6170970614425646, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.2838249206543, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8694153428077698, "num_tokens": 484884361.0, "step": 12712 }, { "epoch": 1.6172242717211551, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.602622985839844, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8419753909111023, "num_tokens": 484922106.0, "step": 12713 }, { "epoch": 1.6173514819997457, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.40003204345703, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8621116876602173, "num_tokens": 484963100.0, "step": 12714 }, { "epoch": 1.6174786922783362, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.06064987182617, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8740081191062927, "num_tokens": 485006896.0, "step": 12715 }, { "epoch": 1.6176059025569267, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.204280853271484, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8620872497558594, "num_tokens": 485045968.0, "step": 12716 }, { "epoch": 1.6177331128355172, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.138153076171875, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8595895171165466, "num_tokens": 485084736.0, "step": 12717 }, { "epoch": 1.6178603231141078, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.2685432434082, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8681445717811584, "num_tokens": 485123130.0, "step": 12718 }, { "epoch": 1.6179875333926983, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.84783935546875, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8595244884490967, "num_tokens": 485158072.0, "step": 12719 }, { "epoch": 1.6181147436712886, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.70609664916992, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8708847761154175, "num_tokens": 485196783.0, "step": 12720 }, { "epoch": 1.6182419539498791, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7498664855957, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8671576976776123, "num_tokens": 485231433.0, "step": 12721 }, { "epoch": 1.6183691642284697, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.446441650390625, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8598011136054993, "num_tokens": 485269016.0, "step": 12722 }, { "epoch": 1.6184963745070602, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.78755187988281, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8510475158691406, "num_tokens": 485308293.0, "step": 12723 }, { "epoch": 1.6186235847856507, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.29851531982422, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8706393837928772, "num_tokens": 485340758.0, "step": 12724 }, { "epoch": 1.6187507950642412, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.30752182006836, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8764554262161255, "num_tokens": 485384047.0, "step": 12725 }, { "epoch": 1.6188780053428315, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.60398483276367, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8641992807388306, "num_tokens": 485416909.0, "step": 12726 }, { "epoch": 1.619005215621422, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.295982360839844, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8607823848724365, "num_tokens": 485449532.0, "step": 12727 }, { "epoch": 1.6191324259000126, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.133575439453125, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8719440698623657, "num_tokens": 485491171.0, "step": 12728 }, { "epoch": 1.6192596361786031, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31039810180664, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8585852384567261, "num_tokens": 485525511.0, "step": 12729 }, { "epoch": 1.6193868464571937, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.302398681640625, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8578100800514221, "num_tokens": 485562517.0, "step": 12730 }, { "epoch": 1.6195140567357842, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.41063690185547, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8627129793167114, "num_tokens": 485601058.0, "step": 12731 }, { "epoch": 1.6196412670143747, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.24899673461914, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8749722242355347, "num_tokens": 485634737.0, "step": 12732 }, { "epoch": 1.6197684772929652, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.802547454833984, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8646602034568787, "num_tokens": 485671300.0, "step": 12733 }, { "epoch": 1.6198956875715558, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.04501724243164, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8702023029327393, "num_tokens": 485708712.0, "step": 12734 }, { "epoch": 1.6200228978501463, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.39549255371094, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8740919232368469, "num_tokens": 485746820.0, "step": 12735 }, { "epoch": 1.6201501081287368, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.55463790893555, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8621987104415894, "num_tokens": 485785921.0, "step": 12736 }, { "epoch": 1.6202773184073274, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.004669189453125, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8635281920433044, "num_tokens": 485825766.0, "step": 12737 }, { "epoch": 1.6204045286859179, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.50550079345703, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8565139770507812, "num_tokens": 485862288.0, "step": 12738 }, { "epoch": 1.6205317389645084, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.32471466064453, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8567785024642944, "num_tokens": 485903582.0, "step": 12739 }, { "epoch": 1.620658949243099, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.83987045288086, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8583335280418396, "num_tokens": 485950931.0, "step": 12740 }, { "epoch": 1.6207861595216895, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.3637809753418, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8643521666526794, "num_tokens": 485984185.0, "step": 12741 }, { "epoch": 1.62091336980028, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.01677703857422, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8672677278518677, "num_tokens": 486018796.0, "step": 12742 }, { "epoch": 1.6210405800788705, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.592037200927734, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8487653732299805, "num_tokens": 486055328.0, "step": 12743 }, { "epoch": 1.6211677903574608, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.667236328125, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8556067943572998, "num_tokens": 486092340.0, "step": 12744 }, { "epoch": 1.6212950006360514, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.59310531616211, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8494757413864136, "num_tokens": 486123195.0, "step": 12745 }, { "epoch": 1.621422210914642, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.2307243347168, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8722569346427917, "num_tokens": 486167145.0, "step": 12746 }, { "epoch": 1.6215494211932324, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.48957443237305, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.854911744594574, "num_tokens": 486207925.0, "step": 12747 }, { "epoch": 1.621676631471823, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.00667953491211, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8657916188240051, "num_tokens": 486245446.0, "step": 12748 }, { "epoch": 1.6218038417504135, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.61400604248047, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.862596869468689, "num_tokens": 486280461.0, "step": 12749 }, { "epoch": 1.6219310520290038, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.1551513671875, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8493877649307251, "num_tokens": 486323926.0, "step": 12750 }, { "epoch": 1.6220582623075943, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.210872650146484, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8550834655761719, "num_tokens": 486360016.0, "step": 12751 }, { "epoch": 1.6221854725861848, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.66853713989258, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8652013540267944, "num_tokens": 486394994.0, "step": 12752 }, { "epoch": 1.6223126828647754, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.890262603759766, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8604834079742432, "num_tokens": 486436909.0, "step": 12753 }, { "epoch": 1.622439893143366, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.9736328125, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.869062066078186, "num_tokens": 486473201.0, "step": 12754 }, { "epoch": 1.6225671034219564, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.8472785949707, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8423124551773071, "num_tokens": 486510733.0, "step": 12755 }, { "epoch": 1.622694313700547, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.97462463378906, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8786237835884094, "num_tokens": 486546666.0, "step": 12756 }, { "epoch": 1.6228215239791375, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.99131774902344, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8424965143203735, "num_tokens": 486585639.0, "step": 12757 }, { "epoch": 1.622948734257728, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.876583099365234, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8663828372955322, "num_tokens": 486623028.0, "step": 12758 }, { "epoch": 1.6230759445363185, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.652099609375, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8567108511924744, "num_tokens": 486655906.0, "step": 12759 }, { "epoch": 1.623203154814909, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.62677001953125, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8593823313713074, "num_tokens": 486689461.0, "step": 12760 }, { "epoch": 1.6233303650934996, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07141876220703, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8448677659034729, "num_tokens": 486728373.0, "step": 12761 }, { "epoch": 1.6234575753720901, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.68221664428711, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8729112148284912, "num_tokens": 486769955.0, "step": 12762 }, { "epoch": 1.6235847856506807, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.36913299560547, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8665229082107544, "num_tokens": 486806468.0, "step": 12763 }, { "epoch": 1.6237119959292712, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.398582458496094, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.847984790802002, "num_tokens": 486846526.0, "step": 12764 }, { "epoch": 1.6238392062078617, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.544036865234375, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8788450956344604, "num_tokens": 486886323.0, "step": 12765 }, { "epoch": 1.6239664164864522, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.254878997802734, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8427387475967407, "num_tokens": 486918099.0, "step": 12766 }, { "epoch": 1.6240936267650428, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.47580337524414, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.875828742980957, "num_tokens": 486953789.0, "step": 12767 }, { "epoch": 1.6242208370436333, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.93625259399414, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8610068559646606, "num_tokens": 486996165.0, "step": 12768 }, { "epoch": 1.6243480473222236, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07389450073242, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8725042343139648, "num_tokens": 487028249.0, "step": 12769 }, { "epoch": 1.6244752576008141, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.226951599121094, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8619495034217834, "num_tokens": 487070099.0, "step": 12770 }, { "epoch": 1.6246024678794047, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.21534729003906, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8650202751159668, "num_tokens": 487107117.0, "step": 12771 }, { "epoch": 1.6247296781579952, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.20957946777344, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.849965512752533, "num_tokens": 487146024.0, "step": 12772 }, { "epoch": 1.6248568884365857, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.10262680053711, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8480769395828247, "num_tokens": 487189864.0, "step": 12773 }, { "epoch": 1.6249840987151762, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.33221435546875, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8632861971855164, "num_tokens": 487230307.0, "step": 12774 }, { "epoch": 1.6251113089937665, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.820770263671875, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.864717423915863, "num_tokens": 487261387.0, "step": 12775 }, { "epoch": 1.625238519272357, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.30222702026367, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8508023619651794, "num_tokens": 487298611.0, "step": 12776 }, { "epoch": 1.6253657295509476, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.7315559387207, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8607070446014404, "num_tokens": 487338172.0, "step": 12777 }, { "epoch": 1.6254929398295381, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.26742172241211, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8632228374481201, "num_tokens": 487375126.0, "step": 12778 }, { "epoch": 1.6256201501081287, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.958763122558594, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8732767105102539, "num_tokens": 487416539.0, "step": 12779 }, { "epoch": 1.6257473603867192, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.16740798950195, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8639251589775085, "num_tokens": 487448429.0, "step": 12780 }, { "epoch": 1.6258745706653097, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.981937408447266, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8610122203826904, "num_tokens": 487486054.0, "step": 12781 }, { "epoch": 1.6260017809439002, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.147010803222656, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8689687848091125, "num_tokens": 487523771.0, "step": 12782 }, { "epoch": 1.6261289912224908, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.19088363647461, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8677551746368408, "num_tokens": 487561334.0, "step": 12783 }, { "epoch": 1.6262562015010813, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.18993377685547, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8629680871963501, "num_tokens": 487606382.0, "step": 12784 }, { "epoch": 1.6263834117796718, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.00202178955078, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8581041097640991, "num_tokens": 487647640.0, "step": 12785 }, { "epoch": 1.6265106220582624, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.55424499511719, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8604781627655029, "num_tokens": 487682604.0, "step": 12786 }, { "epoch": 1.6266378323368529, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.973960876464844, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8654887080192566, "num_tokens": 487718271.0, "step": 12787 }, { "epoch": 1.6267650426154434, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.450382232666016, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8477048277854919, "num_tokens": 487752872.0, "step": 12788 }, { "epoch": 1.626892252894034, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.47980880737305, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.851681113243103, "num_tokens": 487798309.0, "step": 12789 }, { "epoch": 1.6270194631726245, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.89567184448242, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.850886881351471, "num_tokens": 487834052.0, "step": 12790 }, { "epoch": 1.627146673451215, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.988548278808594, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8507905602455139, "num_tokens": 487868662.0, "step": 12791 }, { "epoch": 1.6272738837298055, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.379268646240234, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8583724498748779, "num_tokens": 487907425.0, "step": 12792 }, { "epoch": 1.6274010940083958, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.7432975769043, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8704768419265747, "num_tokens": 487942979.0, "step": 12793 }, { "epoch": 1.6275283042869864, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.09150695800781, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8655445575714111, "num_tokens": 487982402.0, "step": 12794 }, { "epoch": 1.6276555145655769, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.738948822021484, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8694761395454407, "num_tokens": 488020071.0, "step": 12795 }, { "epoch": 1.6277827248441674, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.23497772216797, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8477309942245483, "num_tokens": 488057457.0, "step": 12796 }, { "epoch": 1.627909935122758, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.830963134765625, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8583320379257202, "num_tokens": 488094002.0, "step": 12797 }, { "epoch": 1.6280371454013485, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.88671112060547, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8473069667816162, "num_tokens": 488134995.0, "step": 12798 }, { "epoch": 1.6281643556799388, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.38076400756836, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8643926978111267, "num_tokens": 488169942.0, "step": 12799 }, { "epoch": 1.6282915659585293, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.92034912109375, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8635240793228149, "num_tokens": 488210852.0, "step": 12800 }, { "epoch": 1.6284187762371198, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.59982681274414, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.865856409072876, "num_tokens": 488253067.0, "step": 12801 }, { "epoch": 1.6285459865157104, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.82748031616211, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8705543875694275, "num_tokens": 488293498.0, "step": 12802 }, { "epoch": 1.628673196794301, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.499305725097656, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.864413321018219, "num_tokens": 488333931.0, "step": 12803 }, { "epoch": 1.6288004070728914, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.452762603759766, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8677763938903809, "num_tokens": 488371724.0, "step": 12804 }, { "epoch": 1.628927617351482, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.7607307434082, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8572021126747131, "num_tokens": 488415171.0, "step": 12805 }, { "epoch": 1.6290548276300725, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.87964630126953, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8562803268432617, "num_tokens": 488451672.0, "step": 12806 }, { "epoch": 1.629182037908663, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.580841064453125, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8771002888679504, "num_tokens": 488492657.0, "step": 12807 }, { "epoch": 1.6293092481872535, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.76258087158203, "learning_rate": 1e-06, "loss": 0.5308, "mean_token_accuracy": 0.8683647513389587, "num_tokens": 488530681.0, "step": 12808 }, { "epoch": 1.629436458465844, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.49529266357422, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8710858821868896, "num_tokens": 488560542.0, "step": 12809 }, { "epoch": 1.6295636687444346, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.83854293823242, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8635041117668152, "num_tokens": 488597491.0, "step": 12810 }, { "epoch": 1.6296908790230251, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.256141662597656, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8688299059867859, "num_tokens": 488634556.0, "step": 12811 }, { "epoch": 1.6298180893016156, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.73295974731445, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8579473495483398, "num_tokens": 488668174.0, "step": 12812 }, { "epoch": 1.6299452995802062, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.482295989990234, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8662563562393188, "num_tokens": 488705994.0, "step": 12813 }, { "epoch": 1.6300725098587967, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.125492095947266, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8507966995239258, "num_tokens": 488746908.0, "step": 12814 }, { "epoch": 1.6301997201373872, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.694820404052734, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.863461971282959, "num_tokens": 488785607.0, "step": 12815 }, { "epoch": 1.6303269304159778, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.75209426879883, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8596162796020508, "num_tokens": 488823831.0, "step": 12816 }, { "epoch": 1.630454140694568, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.883636474609375, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8688138723373413, "num_tokens": 488859944.0, "step": 12817 }, { "epoch": 1.6305813509731586, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.1756591796875, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8464610576629639, "num_tokens": 488903764.0, "step": 12818 }, { "epoch": 1.6307085612517491, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.68424606323242, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8522587418556213, "num_tokens": 488946890.0, "step": 12819 }, { "epoch": 1.6308357715303397, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.69931411743164, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8577197790145874, "num_tokens": 488981901.0, "step": 12820 }, { "epoch": 1.6309629818089302, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.63220977783203, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8639522790908813, "num_tokens": 489024180.0, "step": 12821 }, { "epoch": 1.6310901920875207, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.301937103271484, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8652182817459106, "num_tokens": 489070971.0, "step": 12822 }, { "epoch": 1.6312174023661112, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.71028137207031, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8722705841064453, "num_tokens": 489111636.0, "step": 12823 }, { "epoch": 1.6313446126447015, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.843650817871094, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.859634518623352, "num_tokens": 489148677.0, "step": 12824 }, { "epoch": 1.631471822923292, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.43754196166992, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8487828969955444, "num_tokens": 489186222.0, "step": 12825 }, { "epoch": 1.6315990332018826, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.6867561340332, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8350416421890259, "num_tokens": 489223173.0, "step": 12826 }, { "epoch": 1.6317262434804731, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.2358512878418, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8624430894851685, "num_tokens": 489259236.0, "step": 12827 }, { "epoch": 1.6318534537590637, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.38340759277344, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.854516863822937, "num_tokens": 489298449.0, "step": 12828 }, { "epoch": 1.6319806640376542, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.082149505615234, "learning_rate": 1e-06, "loss": 0.6259, "mean_token_accuracy": 0.8399028182029724, "num_tokens": 489334800.0, "step": 12829 }, { "epoch": 1.6321078743162447, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.0020637512207, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8682960271835327, "num_tokens": 489378510.0, "step": 12830 }, { "epoch": 1.6322350845948352, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.83418273925781, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8497763872146606, "num_tokens": 489419267.0, "step": 12831 }, { "epoch": 1.6323622948734258, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.212242126464844, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8638710975646973, "num_tokens": 489454772.0, "step": 12832 }, { "epoch": 1.6324895051520163, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.59431457519531, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8550419807434082, "num_tokens": 489490146.0, "step": 12833 }, { "epoch": 1.6326167154306068, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.260074615478516, "learning_rate": 1e-06, "loss": 0.6513, "mean_token_accuracy": 0.8316766619682312, "num_tokens": 489532754.0, "step": 12834 }, { "epoch": 1.6327439257091974, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.7846794128418, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8589774966239929, "num_tokens": 489570378.0, "step": 12835 }, { "epoch": 1.6328711359877879, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.16872787475586, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8522552847862244, "num_tokens": 489604562.0, "step": 12836 }, { "epoch": 1.6329983462663784, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.92668533325195, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8684033155441284, "num_tokens": 489644060.0, "step": 12837 }, { "epoch": 1.633125556544969, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.04256057739258, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8631203174591064, "num_tokens": 489682693.0, "step": 12838 }, { "epoch": 1.6332527668235595, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.03805160522461, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8699502348899841, "num_tokens": 489720786.0, "step": 12839 }, { "epoch": 1.63337997710215, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.3453369140625, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8673081398010254, "num_tokens": 489758773.0, "step": 12840 }, { "epoch": 1.6335071873807405, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.63685607910156, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8623641729354858, "num_tokens": 489793436.0, "step": 12841 }, { "epoch": 1.6336343976593308, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.15348815917969, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8800439238548279, "num_tokens": 489828634.0, "step": 12842 }, { "epoch": 1.6337616079379214, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.198204040527344, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8647681474685669, "num_tokens": 489866335.0, "step": 12843 }, { "epoch": 1.6338888182165119, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.166358947753906, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8689299821853638, "num_tokens": 489903059.0, "step": 12844 }, { "epoch": 1.6340160284951024, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.615684509277344, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8696500062942505, "num_tokens": 489940186.0, "step": 12845 }, { "epoch": 1.634143238773693, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.23129653930664, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.873989462852478, "num_tokens": 489984732.0, "step": 12846 }, { "epoch": 1.6342704490522835, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.26435470581055, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8611505627632141, "num_tokens": 490016699.0, "step": 12847 }, { "epoch": 1.6343976593308738, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.233707427978516, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8550768494606018, "num_tokens": 490051351.0, "step": 12848 }, { "epoch": 1.6345248696094643, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.45847702026367, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.87077397108078, "num_tokens": 490089668.0, "step": 12849 }, { "epoch": 1.6346520798880548, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.03263473510742, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8620935678482056, "num_tokens": 490129651.0, "step": 12850 }, { "epoch": 1.6347792901666454, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.48081588745117, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.853589653968811, "num_tokens": 490168441.0, "step": 12851 }, { "epoch": 1.6349065004452359, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31890106201172, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8712291717529297, "num_tokens": 490204754.0, "step": 12852 }, { "epoch": 1.6350337107238264, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 41.971309661865234, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8764574527740479, "num_tokens": 490244178.0, "step": 12853 }, { "epoch": 1.635160921002417, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.27168273925781, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8580993413925171, "num_tokens": 490281161.0, "step": 12854 }, { "epoch": 1.6352881312810075, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.252079010009766, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8632036447525024, "num_tokens": 490314892.0, "step": 12855 }, { "epoch": 1.635415341559598, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.98743438720703, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8480864763259888, "num_tokens": 490352600.0, "step": 12856 }, { "epoch": 1.6355425518381885, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.43332290649414, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8733874559402466, "num_tokens": 490384487.0, "step": 12857 }, { "epoch": 1.635669762116779, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.95484924316406, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8416000604629517, "num_tokens": 490422199.0, "step": 12858 }, { "epoch": 1.6357969723953696, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.569488525390625, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8536717295646667, "num_tokens": 490458058.0, "step": 12859 }, { "epoch": 1.6359241826739601, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.14681625366211, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8733690977096558, "num_tokens": 490498447.0, "step": 12860 }, { "epoch": 1.6360513929525506, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.82898712158203, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8491964340209961, "num_tokens": 490538502.0, "step": 12861 }, { "epoch": 1.6361786032311412, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.261077880859375, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8659536838531494, "num_tokens": 490575752.0, "step": 12862 }, { "epoch": 1.6363058135097317, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.63228225708008, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8616758584976196, "num_tokens": 490613502.0, "step": 12863 }, { "epoch": 1.6364330237883222, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.5078125, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8557493686676025, "num_tokens": 490653820.0, "step": 12864 }, { "epoch": 1.6365602340669128, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.03067398071289, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8464139699935913, "num_tokens": 490693332.0, "step": 12865 }, { "epoch": 1.636687444345503, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.23530960083008, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8599136471748352, "num_tokens": 490733135.0, "step": 12866 }, { "epoch": 1.6368146546240936, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.58592987060547, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8642685413360596, "num_tokens": 490775505.0, "step": 12867 }, { "epoch": 1.6369418649026841, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.24325180053711, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.861040472984314, "num_tokens": 490817783.0, "step": 12868 }, { "epoch": 1.6370690751812746, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.6495246887207, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8561232686042786, "num_tokens": 490849464.0, "step": 12869 }, { "epoch": 1.6371962854598652, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.20899963378906, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8546343445777893, "num_tokens": 490891925.0, "step": 12870 }, { "epoch": 1.6373234957384557, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.01174545288086, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8533250093460083, "num_tokens": 490926221.0, "step": 12871 }, { "epoch": 1.6374507060170462, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.3510856628418, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.853053629398346, "num_tokens": 490964141.0, "step": 12872 }, { "epoch": 1.6375779162956365, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.292274475097656, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8563463091850281, "num_tokens": 490997917.0, "step": 12873 }, { "epoch": 1.637705126574227, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.479949951171875, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8751067519187927, "num_tokens": 491029244.0, "step": 12874 }, { "epoch": 1.6378323368528176, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.244544982910156, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8609380722045898, "num_tokens": 491066591.0, "step": 12875 }, { "epoch": 1.6379595471314081, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.12439727783203, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8606326580047607, "num_tokens": 491106546.0, "step": 12876 }, { "epoch": 1.6380867574099987, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.42009353637695, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8535436987876892, "num_tokens": 491145512.0, "step": 12877 }, { "epoch": 1.6382139676885892, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.362300872802734, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8606535196304321, "num_tokens": 491185624.0, "step": 12878 }, { "epoch": 1.6383411779671797, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.750946044921875, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8555387854576111, "num_tokens": 491220982.0, "step": 12879 }, { "epoch": 1.6384683882457702, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.31256103515625, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8635953664779663, "num_tokens": 491262501.0, "step": 12880 }, { "epoch": 1.6385955985243608, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.50585174560547, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8715563416481018, "num_tokens": 491298078.0, "step": 12881 }, { "epoch": 1.6387228088029513, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.54817199707031, "learning_rate": 1e-06, "loss": 0.5284, "mean_token_accuracy": 0.8692154288291931, "num_tokens": 491336203.0, "step": 12882 }, { "epoch": 1.6388500190815418, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.88344955444336, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8502224683761597, "num_tokens": 491374723.0, "step": 12883 }, { "epoch": 1.6389772293601323, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.956275939941406, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8621397614479065, "num_tokens": 491410546.0, "step": 12884 }, { "epoch": 1.6391044396387229, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.60251235961914, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8570542335510254, "num_tokens": 491450642.0, "step": 12885 }, { "epoch": 1.6392316499173134, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.27422332763672, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8575458526611328, "num_tokens": 491489893.0, "step": 12886 }, { "epoch": 1.639358860195904, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.239158630371094, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8501825332641602, "num_tokens": 491535870.0, "step": 12887 }, { "epoch": 1.6394860704744945, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.3947868347168, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8592190146446228, "num_tokens": 491575988.0, "step": 12888 }, { "epoch": 1.639613280753085, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.1235237121582, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8637564778327942, "num_tokens": 491616822.0, "step": 12889 }, { "epoch": 1.6397404910316755, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.59892654418945, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8801666498184204, "num_tokens": 491654132.0, "step": 12890 }, { "epoch": 1.6398677013102658, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.17247009277344, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8477776646614075, "num_tokens": 491697215.0, "step": 12891 }, { "epoch": 1.6399949115888564, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.544979095458984, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8393827676773071, "num_tokens": 491731338.0, "step": 12892 }, { "epoch": 1.6401221218674469, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.141845703125, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8581216931343079, "num_tokens": 491770808.0, "step": 12893 }, { "epoch": 1.6402493321460374, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.63835144042969, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.868405818939209, "num_tokens": 491804167.0, "step": 12894 }, { "epoch": 1.640376542424628, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.52091979980469, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8631160259246826, "num_tokens": 491840094.0, "step": 12895 }, { "epoch": 1.6405037527032185, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.39768981933594, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8486940860748291, "num_tokens": 491881643.0, "step": 12896 }, { "epoch": 1.6406309629818088, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.407657623291016, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8482871651649475, "num_tokens": 491920951.0, "step": 12897 }, { "epoch": 1.6407581732603993, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.498600006103516, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8563863039016724, "num_tokens": 491954599.0, "step": 12898 }, { "epoch": 1.6408853835389898, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.18372344970703, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8635110855102539, "num_tokens": 491995375.0, "step": 12899 }, { "epoch": 1.6410125938175804, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.29216766357422, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8567808866500854, "num_tokens": 492035297.0, "step": 12900 }, { "epoch": 1.6411398040961709, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.34858322143555, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8614544868469238, "num_tokens": 492071003.0, "step": 12901 }, { "epoch": 1.6412670143747614, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.94283676147461, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8612515926361084, "num_tokens": 492106046.0, "step": 12902 }, { "epoch": 1.641394224653352, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.357643127441406, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8733102083206177, "num_tokens": 492147252.0, "step": 12903 }, { "epoch": 1.6415214349319425, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.06259536743164, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8510217666625977, "num_tokens": 492188185.0, "step": 12904 }, { "epoch": 1.641648645210533, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.33992004394531, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8654552102088928, "num_tokens": 492219630.0, "step": 12905 }, { "epoch": 1.6417758554891235, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.66880416870117, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8627138137817383, "num_tokens": 492251955.0, "step": 12906 }, { "epoch": 1.641903065767714, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.22282409667969, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8638743162155151, "num_tokens": 492293484.0, "step": 12907 }, { "epoch": 1.6420302760463046, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.683692932128906, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8479213118553162, "num_tokens": 492331169.0, "step": 12908 }, { "epoch": 1.642157486324895, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.70553970336914, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.867957592010498, "num_tokens": 492366071.0, "step": 12909 }, { "epoch": 1.6422846966034856, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.55349349975586, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.881414532661438, "num_tokens": 492396820.0, "step": 12910 }, { "epoch": 1.6424119068820762, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.44306945800781, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8681955337524414, "num_tokens": 492435282.0, "step": 12911 }, { "epoch": 1.6425391171606667, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.074424743652344, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8680856823921204, "num_tokens": 492470195.0, "step": 12912 }, { "epoch": 1.6426663274392572, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.21339797973633, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8542361259460449, "num_tokens": 492514265.0, "step": 12913 }, { "epoch": 1.6427935377178478, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.09370803833008, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8633296489715576, "num_tokens": 492550348.0, "step": 12914 }, { "epoch": 1.642920747996438, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.514747619628906, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8654208183288574, "num_tokens": 492590648.0, "step": 12915 }, { "epoch": 1.6430479582750286, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.18503189086914, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.864874005317688, "num_tokens": 492629523.0, "step": 12916 }, { "epoch": 1.6431751685536191, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.092498779296875, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.857412576675415, "num_tokens": 492668392.0, "step": 12917 }, { "epoch": 1.6433023788322096, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.089385986328125, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8496322631835938, "num_tokens": 492696073.0, "step": 12918 }, { "epoch": 1.6434295891108002, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.974510192871094, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8627605438232422, "num_tokens": 492728590.0, "step": 12919 }, { "epoch": 1.6435567993893907, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.11873245239258, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8557816743850708, "num_tokens": 492767290.0, "step": 12920 }, { "epoch": 1.6436840096679812, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 41.98194122314453, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.87214195728302, "num_tokens": 492801019.0, "step": 12921 }, { "epoch": 1.6438112199465715, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.59477996826172, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8603750467300415, "num_tokens": 492841220.0, "step": 12922 }, { "epoch": 1.643938430225162, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.09934997558594, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8507895469665527, "num_tokens": 492885076.0, "step": 12923 }, { "epoch": 1.6440656405037526, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.95684051513672, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8552379608154297, "num_tokens": 492918554.0, "step": 12924 }, { "epoch": 1.6441928507823431, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 41.761634826660156, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8768947720527649, "num_tokens": 492961890.0, "step": 12925 }, { "epoch": 1.6443200610609336, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.86245346069336, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8716074228286743, "num_tokens": 492997768.0, "step": 12926 }, { "epoch": 1.6444472713395242, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 41.9624137878418, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8858843445777893, "num_tokens": 493037017.0, "step": 12927 }, { "epoch": 1.6445744816181147, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.06575012207031, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8719604015350342, "num_tokens": 493074579.0, "step": 12928 }, { "epoch": 1.6447016918967052, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.74292755126953, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8620818257331848, "num_tokens": 493111249.0, "step": 12929 }, { "epoch": 1.6448289021752958, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.14079284667969, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.84554523229599, "num_tokens": 493150723.0, "step": 12930 }, { "epoch": 1.6449561124538863, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.8002815246582, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8577908873558044, "num_tokens": 493189530.0, "step": 12931 }, { "epoch": 1.6450833227324768, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.01033401489258, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8674747943878174, "num_tokens": 493231369.0, "step": 12932 }, { "epoch": 1.6452105330110673, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.232872009277344, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8481186628341675, "num_tokens": 493278430.0, "step": 12933 }, { "epoch": 1.6453377432896579, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.0077018737793, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8666478395462036, "num_tokens": 493312258.0, "step": 12934 }, { "epoch": 1.6454649535682484, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.32142639160156, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8592684268951416, "num_tokens": 493347725.0, "step": 12935 }, { "epoch": 1.645592163846839, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.132476806640625, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8638532161712646, "num_tokens": 493388119.0, "step": 12936 }, { "epoch": 1.6457193741254295, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.10503005981445, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8623249530792236, "num_tokens": 493424607.0, "step": 12937 }, { "epoch": 1.64584658440402, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 41.84926986694336, "learning_rate": 1e-06, "loss": 0.6309, "mean_token_accuracy": 0.8386695384979248, "num_tokens": 493459546.0, "step": 12938 }, { "epoch": 1.6459737946826105, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.15980529785156, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8659136891365051, "num_tokens": 493494162.0, "step": 12939 }, { "epoch": 1.6461010049612008, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.11245346069336, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8581018447875977, "num_tokens": 493537523.0, "step": 12940 }, { "epoch": 1.6462282152397913, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.09860610961914, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8602358102798462, "num_tokens": 493580065.0, "step": 12941 }, { "epoch": 1.6463554255183819, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.18233871459961, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.869145393371582, "num_tokens": 493626717.0, "step": 12942 }, { "epoch": 1.6464826357969724, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.21614074707031, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8669183254241943, "num_tokens": 493666266.0, "step": 12943 }, { "epoch": 1.646609846075563, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.32912063598633, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8504416346549988, "num_tokens": 493703184.0, "step": 12944 }, { "epoch": 1.6467370563541535, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.1848030090332, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8768173456192017, "num_tokens": 493744061.0, "step": 12945 }, { "epoch": 1.6468642666327438, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.81035614013672, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8614221215248108, "num_tokens": 493783585.0, "step": 12946 }, { "epoch": 1.6469914769113343, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 41.84430694580078, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8680223226547241, "num_tokens": 493827781.0, "step": 12947 }, { "epoch": 1.6471186871899248, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.06304168701172, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8648107051849365, "num_tokens": 493864418.0, "step": 12948 }, { "epoch": 1.6472458974685154, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.96330261230469, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8603661060333252, "num_tokens": 493895226.0, "step": 12949 }, { "epoch": 1.6473731077471059, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.20079040527344, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8529601097106934, "num_tokens": 493935210.0, "step": 12950 }, { "epoch": 1.6475003180256964, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.274024963378906, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.860693097114563, "num_tokens": 493966368.0, "step": 12951 }, { "epoch": 1.647627528304287, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.14521408081055, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8540441989898682, "num_tokens": 494004304.0, "step": 12952 }, { "epoch": 1.6477547385828775, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.67138671875, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.867970883846283, "num_tokens": 494044758.0, "step": 12953 }, { "epoch": 1.647881948861468, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.924095153808594, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8461774587631226, "num_tokens": 494080624.0, "step": 12954 }, { "epoch": 1.6480091591400585, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.81623458862305, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8672980070114136, "num_tokens": 494116215.0, "step": 12955 }, { "epoch": 1.648136369418649, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.43500518798828, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8601251840591431, "num_tokens": 494150317.0, "step": 12956 }, { "epoch": 1.6482635796972396, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.88742446899414, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.874434769153595, "num_tokens": 494187273.0, "step": 12957 }, { "epoch": 1.64839078997583, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.07118606567383, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8579065799713135, "num_tokens": 494226921.0, "step": 12958 }, { "epoch": 1.6485180002544206, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.765506744384766, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8662396669387817, "num_tokens": 494263141.0, "step": 12959 }, { "epoch": 1.6486452105330112, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.138980865478516, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.855732798576355, "num_tokens": 494307925.0, "step": 12960 }, { "epoch": 1.6487724208116017, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.852561950683594, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8547983765602112, "num_tokens": 494353428.0, "step": 12961 }, { "epoch": 1.6488996310901922, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.536888122558594, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.848933219909668, "num_tokens": 494383559.0, "step": 12962 }, { "epoch": 1.6490268413687827, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.41619873046875, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8639119863510132, "num_tokens": 494419564.0, "step": 12963 }, { "epoch": 1.649154051647373, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.37633514404297, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8652118444442749, "num_tokens": 494466886.0, "step": 12964 }, { "epoch": 1.6492812619259636, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.02293395996094, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8444056510925293, "num_tokens": 494506672.0, "step": 12965 }, { "epoch": 1.649408472204554, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.641746520996094, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8611060976982117, "num_tokens": 494541881.0, "step": 12966 }, { "epoch": 1.6495356824831446, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.38275146484375, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8669267892837524, "num_tokens": 494583546.0, "step": 12967 }, { "epoch": 1.6496628927617352, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.628604888916016, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8494096994400024, "num_tokens": 494616625.0, "step": 12968 }, { "epoch": 1.6497901030403257, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.0911979675293, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8759074211120605, "num_tokens": 494654590.0, "step": 12969 }, { "epoch": 1.6499173133189162, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.78154373168945, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8761468529701233, "num_tokens": 494692471.0, "step": 12970 }, { "epoch": 1.6500445235975065, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.51298522949219, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8530155420303345, "num_tokens": 494730347.0, "step": 12971 }, { "epoch": 1.650171733876097, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.48584747314453, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8486589193344116, "num_tokens": 494763753.0, "step": 12972 }, { "epoch": 1.6502989441546876, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.559242248535156, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8555294871330261, "num_tokens": 494800416.0, "step": 12973 }, { "epoch": 1.6504261544332781, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.41907501220703, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8608150482177734, "num_tokens": 494842733.0, "step": 12974 }, { "epoch": 1.6505533647118686, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.45745849609375, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8538795113563538, "num_tokens": 494878661.0, "step": 12975 }, { "epoch": 1.6506805749904592, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.06856155395508, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8542616963386536, "num_tokens": 494918142.0, "step": 12976 }, { "epoch": 1.6508077852690497, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.168819427490234, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8608995079994202, "num_tokens": 494956947.0, "step": 12977 }, { "epoch": 1.6509349955476402, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.050106048583984, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8562520742416382, "num_tokens": 494989850.0, "step": 12978 }, { "epoch": 1.6510622058262308, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.92481231689453, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8522999286651611, "num_tokens": 495026770.0, "step": 12979 }, { "epoch": 1.6511894161048213, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.24432373046875, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8726086616516113, "num_tokens": 495065009.0, "step": 12980 }, { "epoch": 1.6513166263834118, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.28565979003906, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8616801500320435, "num_tokens": 495096914.0, "step": 12981 }, { "epoch": 1.6514438366620023, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.293487548828125, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8656117916107178, "num_tokens": 495138348.0, "step": 12982 }, { "epoch": 1.6515710469405929, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.20906448364258, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8552356958389282, "num_tokens": 495177026.0, "step": 12983 }, { "epoch": 1.6516982572191834, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.355831146240234, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8631073832511902, "num_tokens": 495213850.0, "step": 12984 }, { "epoch": 1.651825467497774, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.14802932739258, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8704109191894531, "num_tokens": 495257213.0, "step": 12985 }, { "epoch": 1.6519526777763645, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.65070343017578, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8515645265579224, "num_tokens": 495300018.0, "step": 12986 }, { "epoch": 1.652079888054955, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.15680694580078, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8599722385406494, "num_tokens": 495336471.0, "step": 12987 }, { "epoch": 1.6522070983335455, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.74863815307617, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8637396693229675, "num_tokens": 495374680.0, "step": 12988 }, { "epoch": 1.6523343086121358, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.21895217895508, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8630346655845642, "num_tokens": 495414096.0, "step": 12989 }, { "epoch": 1.6524615188907263, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.194541931152344, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8507263660430908, "num_tokens": 495452955.0, "step": 12990 }, { "epoch": 1.6525887291693169, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.964149475097656, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.858313798904419, "num_tokens": 495490091.0, "step": 12991 }, { "epoch": 1.6527159394479074, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.4886589050293, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8686740398406982, "num_tokens": 495524543.0, "step": 12992 }, { "epoch": 1.652843149726498, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.97755813598633, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8515411615371704, "num_tokens": 495562068.0, "step": 12993 }, { "epoch": 1.6529703600050885, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.551124572753906, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8688788414001465, "num_tokens": 495598355.0, "step": 12994 }, { "epoch": 1.6530975702836788, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.11650848388672, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8465153574943542, "num_tokens": 495634735.0, "step": 12995 }, { "epoch": 1.6532247805622693, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.007545471191406, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8607838749885559, "num_tokens": 495668804.0, "step": 12996 }, { "epoch": 1.6533519908408598, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.742183685302734, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8651589155197144, "num_tokens": 495707129.0, "step": 12997 }, { "epoch": 1.6534792011194503, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.8088493347168, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8415554761886597, "num_tokens": 495746501.0, "step": 12998 }, { "epoch": 1.6536064113980409, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.844539642333984, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8775100708007812, "num_tokens": 495789723.0, "step": 12999 }, { "epoch": 1.6537336216766314, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.57924270629883, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8532900810241699, "num_tokens": 495827073.0, "step": 13000 }, { "epoch": 1.653860831955222, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.38727569580078, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8496913909912109, "num_tokens": 495862362.0, "step": 13001 }, { "epoch": 1.6539880422338125, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.253055572509766, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8637166023254395, "num_tokens": 495905546.0, "step": 13002 }, { "epoch": 1.654115252512403, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.296016693115234, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.858214259147644, "num_tokens": 495942291.0, "step": 13003 }, { "epoch": 1.6542424627909935, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.23908996582031, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8506712913513184, "num_tokens": 495978631.0, "step": 13004 }, { "epoch": 1.654369673069584, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.06169509887695, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8556681275367737, "num_tokens": 496016626.0, "step": 13005 }, { "epoch": 1.6544968833481746, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.324703216552734, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8521589636802673, "num_tokens": 496051949.0, "step": 13006 }, { "epoch": 1.654624093626765, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.26024627685547, "learning_rate": 1e-06, "loss": 0.5406, "mean_token_accuracy": 0.8665553331375122, "num_tokens": 496092693.0, "step": 13007 }, { "epoch": 1.6547513039053556, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.25273513793945, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8556599617004395, "num_tokens": 496132853.0, "step": 13008 }, { "epoch": 1.6548785141839462, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.271385192871094, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8456614017486572, "num_tokens": 496168303.0, "step": 13009 }, { "epoch": 1.6550057244625367, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.49714660644531, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8478003144264221, "num_tokens": 496206132.0, "step": 13010 }, { "epoch": 1.6551329347411272, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.24834442138672, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8541598916053772, "num_tokens": 496247542.0, "step": 13011 }, { "epoch": 1.6552601450197177, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.93701171875, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.871737539768219, "num_tokens": 496285698.0, "step": 13012 }, { "epoch": 1.655387355298308, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.378936767578125, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8619217872619629, "num_tokens": 496322216.0, "step": 13013 }, { "epoch": 1.6555145655768986, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.889007568359375, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8573482036590576, "num_tokens": 496357687.0, "step": 13014 }, { "epoch": 1.655641775855489, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.487327575683594, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8734085559844971, "num_tokens": 496400875.0, "step": 13015 }, { "epoch": 1.6557689861340796, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.817195892333984, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.862655520439148, "num_tokens": 496432076.0, "step": 13016 }, { "epoch": 1.6558961964126702, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.499847412109375, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8584007620811462, "num_tokens": 496466852.0, "step": 13017 }, { "epoch": 1.6560234066912607, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.93656921386719, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8585809469223022, "num_tokens": 496508890.0, "step": 13018 }, { "epoch": 1.6561506169698512, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.382808685302734, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8756629228591919, "num_tokens": 496537964.0, "step": 13019 }, { "epoch": 1.6562778272484415, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.82062911987305, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.842780351638794, "num_tokens": 496571263.0, "step": 13020 }, { "epoch": 1.656405037527032, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.5255241394043, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8679642677307129, "num_tokens": 496610058.0, "step": 13021 }, { "epoch": 1.6565322478056226, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.13650131225586, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8539366722106934, "num_tokens": 496640542.0, "step": 13022 }, { "epoch": 1.656659458084213, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.68790817260742, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.85942143201828, "num_tokens": 496678030.0, "step": 13023 }, { "epoch": 1.6567866683628036, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.109161376953125, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8573189377784729, "num_tokens": 496714057.0, "step": 13024 }, { "epoch": 1.6569138786413942, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.27277374267578, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8762359619140625, "num_tokens": 496744501.0, "step": 13025 }, { "epoch": 1.6570410889199847, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.73225784301758, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8645632266998291, "num_tokens": 496780233.0, "step": 13026 }, { "epoch": 1.6571682991985752, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.189083099365234, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8621392846107483, "num_tokens": 496809645.0, "step": 13027 }, { "epoch": 1.6572955094771658, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.33290481567383, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.86053466796875, "num_tokens": 496854871.0, "step": 13028 }, { "epoch": 1.6574227197557563, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.151771545410156, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.877871036529541, "num_tokens": 496891958.0, "step": 13029 }, { "epoch": 1.6575499300343468, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.022727966308594, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8536838293075562, "num_tokens": 496928550.0, "step": 13030 }, { "epoch": 1.6576771403129373, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.155391693115234, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8602735996246338, "num_tokens": 496972009.0, "step": 13031 }, { "epoch": 1.6578043505915279, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 41.803855895996094, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8579373359680176, "num_tokens": 497013605.0, "step": 13032 }, { "epoch": 1.6579315608701184, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.23467254638672, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8627507090568542, "num_tokens": 497048712.0, "step": 13033 }, { "epoch": 1.658058771148709, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.65284729003906, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8755236268043518, "num_tokens": 497089161.0, "step": 13034 }, { "epoch": 1.6581859814272994, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.34675979614258, "learning_rate": 1e-06, "loss": 0.6483, "mean_token_accuracy": 0.8318647146224976, "num_tokens": 497127253.0, "step": 13035 }, { "epoch": 1.65831319170589, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.21455764770508, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8569132089614868, "num_tokens": 497167637.0, "step": 13036 }, { "epoch": 1.6584404019844805, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.1445426940918, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8610307574272156, "num_tokens": 497205663.0, "step": 13037 }, { "epoch": 1.6585676122630708, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.264556884765625, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8768484592437744, "num_tokens": 497242146.0, "step": 13038 }, { "epoch": 1.6586948225416613, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.28215026855469, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8671340346336365, "num_tokens": 497279339.0, "step": 13039 }, { "epoch": 1.6588220328202519, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.01378631591797, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8524065017700195, "num_tokens": 497318530.0, "step": 13040 }, { "epoch": 1.6589492430988424, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.161231994628906, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8533445596694946, "num_tokens": 497352943.0, "step": 13041 }, { "epoch": 1.659076453377433, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.07530975341797, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8546457290649414, "num_tokens": 497387729.0, "step": 13042 }, { "epoch": 1.6592036636560235, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.41482162475586, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8626722693443298, "num_tokens": 497427875.0, "step": 13043 }, { "epoch": 1.6593308739346138, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.261478424072266, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8528755307197571, "num_tokens": 497469076.0, "step": 13044 }, { "epoch": 1.6594580842132043, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 41.984771728515625, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8624193668365479, "num_tokens": 497509556.0, "step": 13045 }, { "epoch": 1.6595852944917948, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.509422302246094, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8518147468566895, "num_tokens": 497548151.0, "step": 13046 }, { "epoch": 1.6597125047703853, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.6851921081543, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8343949317932129, "num_tokens": 497589088.0, "step": 13047 }, { "epoch": 1.6598397150489759, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.876197814941406, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8707494139671326, "num_tokens": 497632095.0, "step": 13048 }, { "epoch": 1.6599669253275664, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.95211410522461, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8732954263687134, "num_tokens": 497665148.0, "step": 13049 }, { "epoch": 1.660094135606157, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.58921432495117, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8548237085342407, "num_tokens": 497700134.0, "step": 13050 }, { "epoch": 1.6602213458847475, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.258548736572266, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8659486174583435, "num_tokens": 497743101.0, "step": 13051 }, { "epoch": 1.660348556163338, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.55217361450195, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8527315855026245, "num_tokens": 497775112.0, "step": 13052 }, { "epoch": 1.6604757664419285, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.24222183227539, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8717252016067505, "num_tokens": 497813320.0, "step": 13053 }, { "epoch": 1.660602976720519, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.21888732910156, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8558571934700012, "num_tokens": 497852780.0, "step": 13054 }, { "epoch": 1.6607301869991096, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.305049896240234, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8627990484237671, "num_tokens": 497886313.0, "step": 13055 }, { "epoch": 1.6608573972777, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.04072570800781, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8592752814292908, "num_tokens": 497924108.0, "step": 13056 }, { "epoch": 1.6609846075562906, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.648399353027344, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8485696315765381, "num_tokens": 497964656.0, "step": 13057 }, { "epoch": 1.6611118178348812, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.62059783935547, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.845522403717041, "num_tokens": 498005436.0, "step": 13058 }, { "epoch": 1.6612390281134717, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.635841369628906, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8570863008499146, "num_tokens": 498042237.0, "step": 13059 }, { "epoch": 1.6613662383920622, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.03974151611328, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8589664697647095, "num_tokens": 498083008.0, "step": 13060 }, { "epoch": 1.6614934486706527, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.56523895263672, "learning_rate": 1e-06, "loss": 0.638, "mean_token_accuracy": 0.8370112180709839, "num_tokens": 498120550.0, "step": 13061 }, { "epoch": 1.661620658949243, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.59502410888672, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8564637899398804, "num_tokens": 498153033.0, "step": 13062 }, { "epoch": 1.6617478692278336, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.5115852355957, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8622304797172546, "num_tokens": 498190491.0, "step": 13063 }, { "epoch": 1.661875079506424, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 41.994083404541016, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8534419536590576, "num_tokens": 498235889.0, "step": 13064 }, { "epoch": 1.6620022897850146, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.30834197998047, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.857984721660614, "num_tokens": 498277191.0, "step": 13065 }, { "epoch": 1.6621295000636052, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.30787658691406, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8480804562568665, "num_tokens": 498318443.0, "step": 13066 }, { "epoch": 1.6622567103421957, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.345035552978516, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8464260101318359, "num_tokens": 498355955.0, "step": 13067 }, { "epoch": 1.662383920620786, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.3734130859375, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8615421056747437, "num_tokens": 498388588.0, "step": 13068 }, { "epoch": 1.6625111308993765, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.14662551879883, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8589463233947754, "num_tokens": 498430661.0, "step": 13069 }, { "epoch": 1.662638341177967, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.374603271484375, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8484498262405396, "num_tokens": 498470428.0, "step": 13070 }, { "epoch": 1.6627655514565576, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.03139877319336, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.859275221824646, "num_tokens": 498512031.0, "step": 13071 }, { "epoch": 1.662892761735148, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.763816833496094, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8838974833488464, "num_tokens": 498548344.0, "step": 13072 }, { "epoch": 1.6630199720137386, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.189537048339844, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.874228835105896, "num_tokens": 498590115.0, "step": 13073 }, { "epoch": 1.6631471822923292, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.961883544921875, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8602479100227356, "num_tokens": 498634034.0, "step": 13074 }, { "epoch": 1.6632743925709197, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.06893539428711, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8528468012809753, "num_tokens": 498675086.0, "step": 13075 }, { "epoch": 1.6634016028495102, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.27397537231445, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8667083382606506, "num_tokens": 498711374.0, "step": 13076 }, { "epoch": 1.6635288131281007, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.04680633544922, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.866119921207428, "num_tokens": 498753050.0, "step": 13077 }, { "epoch": 1.6636560234066913, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.14560317993164, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8621969223022461, "num_tokens": 498790406.0, "step": 13078 }, { "epoch": 1.6637832336852818, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.21574020385742, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8599840402603149, "num_tokens": 498827712.0, "step": 13079 }, { "epoch": 1.6639104439638723, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.3048095703125, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8656032681465149, "num_tokens": 498862283.0, "step": 13080 }, { "epoch": 1.6640376542424629, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.75990676879883, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.860399603843689, "num_tokens": 498902686.0, "step": 13081 }, { "epoch": 1.6641648645210534, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.3822021484375, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8491330146789551, "num_tokens": 498942785.0, "step": 13082 }, { "epoch": 1.664292074799644, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.282249450683594, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8666103482246399, "num_tokens": 498982761.0, "step": 13083 }, { "epoch": 1.6644192850782344, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.65485763549805, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.8417782783508301, "num_tokens": 499025762.0, "step": 13084 }, { "epoch": 1.664546495356825, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.80019760131836, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.851120114326477, "num_tokens": 499063394.0, "step": 13085 }, { "epoch": 1.6646737056354155, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.36263656616211, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8718133568763733, "num_tokens": 499105824.0, "step": 13086 }, { "epoch": 1.6648009159140058, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.70170593261719, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8670535087585449, "num_tokens": 499140935.0, "step": 13087 }, { "epoch": 1.6649281261925963, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.528663635253906, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8598953485488892, "num_tokens": 499179492.0, "step": 13088 }, { "epoch": 1.6650553364711869, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.85565948486328, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8694120645523071, "num_tokens": 499214716.0, "step": 13089 }, { "epoch": 1.6651825467497774, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.72401809692383, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8670292496681213, "num_tokens": 499250337.0, "step": 13090 }, { "epoch": 1.665309757028368, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.70325469970703, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8651634454727173, "num_tokens": 499283675.0, "step": 13091 }, { "epoch": 1.6654369673069584, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.77088928222656, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8656913042068481, "num_tokens": 499319153.0, "step": 13092 }, { "epoch": 1.6655641775855488, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.900821685791016, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8581882119178772, "num_tokens": 499356675.0, "step": 13093 }, { "epoch": 1.6656913878641393, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.15385055541992, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8761890530586243, "num_tokens": 499392129.0, "step": 13094 }, { "epoch": 1.6658185981427298, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.72653579711914, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8720309734344482, "num_tokens": 499424958.0, "step": 13095 }, { "epoch": 1.6659458084213203, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.742462158203125, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8697600960731506, "num_tokens": 499467769.0, "step": 13096 }, { "epoch": 1.6660730186999109, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.82355499267578, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8502850532531738, "num_tokens": 499513828.0, "step": 13097 }, { "epoch": 1.6662002289785014, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.644100189208984, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8651691675186157, "num_tokens": 499548219.0, "step": 13098 }, { "epoch": 1.666327439257092, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.741546630859375, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8448760509490967, "num_tokens": 499587158.0, "step": 13099 }, { "epoch": 1.6664546495356825, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.806915283203125, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8630360960960388, "num_tokens": 499625235.0, "step": 13100 }, { "epoch": 1.666581859814273, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.871986389160156, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8410226106643677, "num_tokens": 499662842.0, "step": 13101 }, { "epoch": 1.6667090700928635, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.481136322021484, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.8381531834602356, "num_tokens": 499703464.0, "step": 13102 }, { "epoch": 1.666836280371454, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.543338775634766, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8609259128570557, "num_tokens": 499739160.0, "step": 13103 }, { "epoch": 1.6669634906500446, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.869895935058594, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8467631340026855, "num_tokens": 499775024.0, "step": 13104 }, { "epoch": 1.667090700928635, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.8149299621582, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8634109497070312, "num_tokens": 499816479.0, "step": 13105 }, { "epoch": 1.6672179112072256, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.892601013183594, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.864513635635376, "num_tokens": 499848790.0, "step": 13106 }, { "epoch": 1.6673451214858162, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.6567268371582, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8807544112205505, "num_tokens": 499890955.0, "step": 13107 }, { "epoch": 1.6674723317644067, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.08047866821289, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8696560859680176, "num_tokens": 499924813.0, "step": 13108 }, { "epoch": 1.6675995420429972, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.888858795166016, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8581827878952026, "num_tokens": 499960032.0, "step": 13109 }, { "epoch": 1.6677267523215877, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.876468658447266, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8741514682769775, "num_tokens": 499995868.0, "step": 13110 }, { "epoch": 1.667853962600178, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.73701858520508, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8646949529647827, "num_tokens": 500031662.0, "step": 13111 }, { "epoch": 1.6679811728787686, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.635807037353516, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8518364429473877, "num_tokens": 500070189.0, "step": 13112 }, { "epoch": 1.668108383157359, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.55062484741211, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8575665950775146, "num_tokens": 500109999.0, "step": 13113 }, { "epoch": 1.6682355934359496, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.33847427368164, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8427603244781494, "num_tokens": 500151473.0, "step": 13114 }, { "epoch": 1.6683628037145402, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.1235237121582, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8588361740112305, "num_tokens": 500187436.0, "step": 13115 }, { "epoch": 1.6684900139931307, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.4833869934082, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8604863882064819, "num_tokens": 500222433.0, "step": 13116 }, { "epoch": 1.668617224271721, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 44.004947662353516, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.861863911151886, "num_tokens": 500257390.0, "step": 13117 }, { "epoch": 1.6687444345503115, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.19029998779297, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8617969751358032, "num_tokens": 500301260.0, "step": 13118 }, { "epoch": 1.668871644828902, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.467769622802734, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8665958046913147, "num_tokens": 500343453.0, "step": 13119 }, { "epoch": 1.6689988551074926, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.88618087768555, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8669444918632507, "num_tokens": 500375133.0, "step": 13120 }, { "epoch": 1.669126065386083, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.75162887573242, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8550328612327576, "num_tokens": 500408540.0, "step": 13121 }, { "epoch": 1.6692532756646736, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.34269714355469, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.874374270439148, "num_tokens": 500446243.0, "step": 13122 }, { "epoch": 1.6693804859432642, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.42939376831055, "learning_rate": 1e-06, "loss": 0.6452, "mean_token_accuracy": 0.8375087380409241, "num_tokens": 500484313.0, "step": 13123 }, { "epoch": 1.6695076962218547, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.78624725341797, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8730834126472473, "num_tokens": 500525566.0, "step": 13124 }, { "epoch": 1.6696349065004452, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.079280853271484, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8657163381576538, "num_tokens": 500566274.0, "step": 13125 }, { "epoch": 1.6697621167790357, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.52662658691406, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8528386354446411, "num_tokens": 500602769.0, "step": 13126 }, { "epoch": 1.6698893270576263, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.08821105957031, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8450621962547302, "num_tokens": 500642895.0, "step": 13127 }, { "epoch": 1.6700165373362168, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.135250091552734, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8585786819458008, "num_tokens": 500681461.0, "step": 13128 }, { "epoch": 1.6701437476148073, "ewc_loss": 0.14453125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012302398681640625, "grad_norm": 42.19948196411133, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8448883295059204, "num_tokens": 500723395.0, "step": 13129 }, { "epoch": 1.6702709578933979, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.491703033447266, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8621396422386169, "num_tokens": 500760571.0, "step": 13130 }, { "epoch": 1.6703981681719884, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.407470703125, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8498425483703613, "num_tokens": 500797858.0, "step": 13131 }, { "epoch": 1.670525378450579, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.510799407958984, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8661199808120728, "num_tokens": 500840199.0, "step": 13132 }, { "epoch": 1.6706525887291694, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.657596588134766, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8548767566680908, "num_tokens": 500883290.0, "step": 13133 }, { "epoch": 1.67077979900776, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.245758056640625, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.865008533000946, "num_tokens": 500923072.0, "step": 13134 }, { "epoch": 1.6709070092863505, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.71630096435547, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8630218505859375, "num_tokens": 500958726.0, "step": 13135 }, { "epoch": 1.6710342195649408, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.61509323120117, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8559752106666565, "num_tokens": 500996746.0, "step": 13136 }, { "epoch": 1.6711614298435313, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.703182220458984, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8524830341339111, "num_tokens": 501034135.0, "step": 13137 }, { "epoch": 1.6712886401221219, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.2829475402832, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8623870611190796, "num_tokens": 501069810.0, "step": 13138 }, { "epoch": 1.6714158504007124, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.10953140258789, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8555301427841187, "num_tokens": 501110597.0, "step": 13139 }, { "epoch": 1.671543060679303, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.98930740356445, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.848307192325592, "num_tokens": 501145927.0, "step": 13140 }, { "epoch": 1.6716702709578934, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.76213455200195, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8406562805175781, "num_tokens": 501188473.0, "step": 13141 }, { "epoch": 1.6717974812364838, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.32068634033203, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.873036801815033, "num_tokens": 501226143.0, "step": 13142 }, { "epoch": 1.6719246915150743, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.736412048339844, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8510032892227173, "num_tokens": 501266980.0, "step": 13143 }, { "epoch": 1.6720519017936648, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.642677307128906, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8575222492218018, "num_tokens": 501305414.0, "step": 13144 }, { "epoch": 1.6721791120722553, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.4619140625, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8525110483169556, "num_tokens": 501343009.0, "step": 13145 }, { "epoch": 1.6723063223508459, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.6978645324707, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8674458861351013, "num_tokens": 501383571.0, "step": 13146 }, { "epoch": 1.6724335326294364, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.146053314208984, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8626540899276733, "num_tokens": 501425444.0, "step": 13147 }, { "epoch": 1.672560742908027, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.02555465698242, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8736792802810669, "num_tokens": 501466536.0, "step": 13148 }, { "epoch": 1.6726879531866174, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.09370040893555, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.863253116607666, "num_tokens": 501507346.0, "step": 13149 }, { "epoch": 1.672815163465208, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.5671272277832, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.867256760597229, "num_tokens": 501545515.0, "step": 13150 }, { "epoch": 1.6729423737437985, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.79030990600586, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8546191453933716, "num_tokens": 501588793.0, "step": 13151 }, { "epoch": 1.673069584022389, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.312862396240234, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8571014404296875, "num_tokens": 501624384.0, "step": 13152 }, { "epoch": 1.6731967943009796, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.63700485229492, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8594121932983398, "num_tokens": 501663150.0, "step": 13153 }, { "epoch": 1.67332400457957, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.98567199707031, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8544254302978516, "num_tokens": 501706177.0, "step": 13154 }, { "epoch": 1.6734512148581606, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.95277404785156, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8712285757064819, "num_tokens": 501745267.0, "step": 13155 }, { "epoch": 1.6735784251367511, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.871578216552734, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8547135591506958, "num_tokens": 501783591.0, "step": 13156 }, { "epoch": 1.6737056354153417, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.986270904541016, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8507821559906006, "num_tokens": 501825190.0, "step": 13157 }, { "epoch": 1.6738328456939322, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.95509338378906, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8599386811256409, "num_tokens": 501855813.0, "step": 13158 }, { "epoch": 1.6739600559725227, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.988040924072266, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8579137325286865, "num_tokens": 501890106.0, "step": 13159 }, { "epoch": 1.674087266251113, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 41.753658294677734, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8450934886932373, "num_tokens": 501935437.0, "step": 13160 }, { "epoch": 1.6742144765297036, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.87244415283203, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8489724397659302, "num_tokens": 501980134.0, "step": 13161 }, { "epoch": 1.674341686808294, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.16280746459961, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.854325532913208, "num_tokens": 502015977.0, "step": 13162 }, { "epoch": 1.6744688970868846, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.764129638671875, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8620943427085876, "num_tokens": 502061717.0, "step": 13163 }, { "epoch": 1.6745961073654752, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.30644226074219, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8578615188598633, "num_tokens": 502101594.0, "step": 13164 }, { "epoch": 1.6747233176440657, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.57568359375, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8591693639755249, "num_tokens": 502136521.0, "step": 13165 }, { "epoch": 1.674850527922656, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.350765228271484, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8402158617973328, "num_tokens": 502174821.0, "step": 13166 }, { "epoch": 1.6749777382012465, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.542049407958984, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8534672856330872, "num_tokens": 502210615.0, "step": 13167 }, { "epoch": 1.675104948479837, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.59844970703125, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8607237935066223, "num_tokens": 502248220.0, "step": 13168 }, { "epoch": 1.6752321587584276, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.37350082397461, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8587673902511597, "num_tokens": 502290686.0, "step": 13169 }, { "epoch": 1.675359369037018, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.68907928466797, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8478602170944214, "num_tokens": 502331789.0, "step": 13170 }, { "epoch": 1.6754865793156086, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.281593322753906, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8654979467391968, "num_tokens": 502371490.0, "step": 13171 }, { "epoch": 1.6756137895941992, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.6513671875, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8595549464225769, "num_tokens": 502415606.0, "step": 13172 }, { "epoch": 1.6757409998727897, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.148338317871094, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8590859770774841, "num_tokens": 502456203.0, "step": 13173 }, { "epoch": 1.6758682101513802, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.17837905883789, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8759191632270813, "num_tokens": 502491508.0, "step": 13174 }, { "epoch": 1.6759954204299707, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.132144927978516, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8482908606529236, "num_tokens": 502533408.0, "step": 13175 }, { "epoch": 1.6761226307085613, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.99607849121094, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8527828454971313, "num_tokens": 502569824.0, "step": 13176 }, { "epoch": 1.6762498409871518, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.44392013549805, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8682665824890137, "num_tokens": 502604776.0, "step": 13177 }, { "epoch": 1.6763770512657423, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.78402328491211, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8421929478645325, "num_tokens": 502645732.0, "step": 13178 }, { "epoch": 1.6765042615443329, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.559715270996094, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8626110553741455, "num_tokens": 502686044.0, "step": 13179 }, { "epoch": 1.6766314718229234, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.010040283203125, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8504779934883118, "num_tokens": 502728288.0, "step": 13180 }, { "epoch": 1.676758682101514, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.747371673583984, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8519768118858337, "num_tokens": 502760772.0, "step": 13181 }, { "epoch": 1.6768858923801044, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.546058654785156, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8596169948577881, "num_tokens": 502791585.0, "step": 13182 }, { "epoch": 1.677013102658695, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.949859619140625, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8770248889923096, "num_tokens": 502826339.0, "step": 13183 }, { "epoch": 1.6771403129372855, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.78175354003906, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8761699199676514, "num_tokens": 502868716.0, "step": 13184 }, { "epoch": 1.6772675232158758, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.7554817199707, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8583563566207886, "num_tokens": 502907992.0, "step": 13185 }, { "epoch": 1.6773947334944663, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.00749206542969, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8535301685333252, "num_tokens": 502947197.0, "step": 13186 }, { "epoch": 1.6775219437730569, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.958065032958984, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8698599338531494, "num_tokens": 502985890.0, "step": 13187 }, { "epoch": 1.6776491540516474, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.69447708129883, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8533594012260437, "num_tokens": 503033476.0, "step": 13188 }, { "epoch": 1.677776364330238, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.02065658569336, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8693203926086426, "num_tokens": 503068002.0, "step": 13189 }, { "epoch": 1.6779035746088284, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.49934768676758, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8527901768684387, "num_tokens": 503110059.0, "step": 13190 }, { "epoch": 1.6780307848874187, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.275611877441406, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8537299036979675, "num_tokens": 503146617.0, "step": 13191 }, { "epoch": 1.6781579951660093, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.22677993774414, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8566212058067322, "num_tokens": 503186436.0, "step": 13192 }, { "epoch": 1.6782852054445998, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.58218765258789, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8645873665809631, "num_tokens": 503227983.0, "step": 13193 }, { "epoch": 1.6784124157231903, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 41.88052749633789, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.847511887550354, "num_tokens": 503269301.0, "step": 13194 }, { "epoch": 1.6785396260017809, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.781436920166016, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.842499315738678, "num_tokens": 503308919.0, "step": 13195 }, { "epoch": 1.6786668362803714, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.15230941772461, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.869649350643158, "num_tokens": 503345030.0, "step": 13196 }, { "epoch": 1.678794046558962, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.546409606933594, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8519574403762817, "num_tokens": 503379142.0, "step": 13197 }, { "epoch": 1.6789212568375524, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.137550354003906, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8595159649848938, "num_tokens": 503411810.0, "step": 13198 }, { "epoch": 1.679048467116143, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.438438415527344, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8561930656433105, "num_tokens": 503450266.0, "step": 13199 }, { "epoch": 1.6791756773947335, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.363372802734375, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8670008182525635, "num_tokens": 503491015.0, "step": 13200 }, { "epoch": 1.679302887673324, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.121585845947266, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8677957057952881, "num_tokens": 503531964.0, "step": 13201 }, { "epoch": 1.6794300979519146, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.37070083618164, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8627705574035645, "num_tokens": 503576069.0, "step": 13202 }, { "epoch": 1.679557308230505, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.6595458984375, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8617377877235413, "num_tokens": 503607895.0, "step": 13203 }, { "epoch": 1.6796845185090956, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.35246658325195, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8542889952659607, "num_tokens": 503636891.0, "step": 13204 }, { "epoch": 1.6798117287876861, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.609962463378906, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8748800754547119, "num_tokens": 503682037.0, "step": 13205 }, { "epoch": 1.6799389390662767, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.69001770019531, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8612418174743652, "num_tokens": 503724008.0, "step": 13206 }, { "epoch": 1.6800661493448672, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.285926818847656, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8679017424583435, "num_tokens": 503759008.0, "step": 13207 }, { "epoch": 1.6801933596234577, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.613834381103516, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8561020493507385, "num_tokens": 503796599.0, "step": 13208 }, { "epoch": 1.680320569902048, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.78569412231445, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8572556972503662, "num_tokens": 503835539.0, "step": 13209 }, { "epoch": 1.6804477801806386, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.87504577636719, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8571555614471436, "num_tokens": 503875933.0, "step": 13210 }, { "epoch": 1.680574990459229, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.97188949584961, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8550711870193481, "num_tokens": 503913740.0, "step": 13211 }, { "epoch": 1.6807022007378196, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.6203727722168, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8624166250228882, "num_tokens": 503955105.0, "step": 13212 }, { "epoch": 1.6808294110164101, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.31125259399414, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8590472936630249, "num_tokens": 503987641.0, "step": 13213 }, { "epoch": 1.6809566212950007, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.26676940917969, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8518013954162598, "num_tokens": 504028699.0, "step": 13214 }, { "epoch": 1.681083831573591, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.00547790527344, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8541285991668701, "num_tokens": 504065597.0, "step": 13215 }, { "epoch": 1.6812110418521815, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.38663101196289, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8445647358894348, "num_tokens": 504105968.0, "step": 13216 }, { "epoch": 1.681338252130772, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.010860443115234, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8637977838516235, "num_tokens": 504143743.0, "step": 13217 }, { "epoch": 1.6814654624093626, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.598201751708984, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8631763458251953, "num_tokens": 504177023.0, "step": 13218 }, { "epoch": 1.681592672687953, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.71305465698242, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.857280969619751, "num_tokens": 504210492.0, "step": 13219 }, { "epoch": 1.6817198829665436, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.94263458251953, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8591055870056152, "num_tokens": 504248322.0, "step": 13220 }, { "epoch": 1.6818470932451342, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.21504592895508, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8463279008865356, "num_tokens": 504285915.0, "step": 13221 }, { "epoch": 1.6819743035237247, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.441158294677734, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8574126362800598, "num_tokens": 504325578.0, "step": 13222 }, { "epoch": 1.6821015138023152, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.31787109375, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8800557851791382, "num_tokens": 504362876.0, "step": 13223 }, { "epoch": 1.6822287240809057, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.219276428222656, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8573421239852905, "num_tokens": 504402303.0, "step": 13224 }, { "epoch": 1.6823559343594963, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.700565338134766, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8589645624160767, "num_tokens": 504440039.0, "step": 13225 }, { "epoch": 1.6824831446380868, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.16261672973633, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8497068881988525, "num_tokens": 504485102.0, "step": 13226 }, { "epoch": 1.6826103549166773, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.47722625732422, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8580889701843262, "num_tokens": 504522762.0, "step": 13227 }, { "epoch": 1.6827375651952678, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.488277435302734, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8641499280929565, "num_tokens": 504562818.0, "step": 13228 }, { "epoch": 1.6828647754738584, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.446529388427734, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8505967259407043, "num_tokens": 504602914.0, "step": 13229 }, { "epoch": 1.682991985752449, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.90010452270508, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.866559624671936, "num_tokens": 504641376.0, "step": 13230 }, { "epoch": 1.6831191960310394, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.16444396972656, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8666427135467529, "num_tokens": 504675031.0, "step": 13231 }, { "epoch": 1.68324640630963, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.37855911254883, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8653640151023865, "num_tokens": 504709624.0, "step": 13232 }, { "epoch": 1.6833736165882205, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.0759391784668, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8625430464744568, "num_tokens": 504749310.0, "step": 13233 }, { "epoch": 1.6835008268668108, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.15388107299805, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8620401620864868, "num_tokens": 504791700.0, "step": 13234 }, { "epoch": 1.6836280371454013, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.9369010925293, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8489757776260376, "num_tokens": 504833246.0, "step": 13235 }, { "epoch": 1.6837552474239919, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.88990783691406, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8635315895080566, "num_tokens": 504872901.0, "step": 13236 }, { "epoch": 1.6838824577025824, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.55902862548828, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8525938391685486, "num_tokens": 504906138.0, "step": 13237 }, { "epoch": 1.684009667981173, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.93318557739258, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8394739627838135, "num_tokens": 504947147.0, "step": 13238 }, { "epoch": 1.6841368782597634, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.15172576904297, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8604776859283447, "num_tokens": 504985332.0, "step": 13239 }, { "epoch": 1.6842640885383537, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.05707550048828, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8511711359024048, "num_tokens": 505027995.0, "step": 13240 }, { "epoch": 1.6843912988169443, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.7374153137207, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.86217200756073, "num_tokens": 505065311.0, "step": 13241 }, { "epoch": 1.6845185090955348, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.05536651611328, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8555828928947449, "num_tokens": 505105810.0, "step": 13242 }, { "epoch": 1.6846457193741253, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.53633117675781, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8670535087585449, "num_tokens": 505141003.0, "step": 13243 }, { "epoch": 1.6847729296527159, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.059696197509766, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8492312431335449, "num_tokens": 505188025.0, "step": 13244 }, { "epoch": 1.6849001399313064, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.80925750732422, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8725857734680176, "num_tokens": 505226101.0, "step": 13245 }, { "epoch": 1.685027350209897, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.4436149597168, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8502929210662842, "num_tokens": 505264880.0, "step": 13246 }, { "epoch": 1.6851545604884874, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.90081787109375, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8523171544075012, "num_tokens": 505306208.0, "step": 13247 }, { "epoch": 1.685281770767078, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.81464767456055, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8823166489601135, "num_tokens": 505346024.0, "step": 13248 }, { "epoch": 1.6854089810456685, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.892887115478516, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8824881315231323, "num_tokens": 505383644.0, "step": 13249 }, { "epoch": 1.685536191324259, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.568687438964844, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8632085919380188, "num_tokens": 505420365.0, "step": 13250 }, { "epoch": 1.6856634016028496, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.575191497802734, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8598487973213196, "num_tokens": 505458211.0, "step": 13251 }, { "epoch": 1.68579061188144, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.748313903808594, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8541558980941772, "num_tokens": 505501649.0, "step": 13252 }, { "epoch": 1.6859178221600306, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.423240661621094, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.862214207649231, "num_tokens": 505538763.0, "step": 13253 }, { "epoch": 1.6860450324386211, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.026912689208984, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8818868398666382, "num_tokens": 505574562.0, "step": 13254 }, { "epoch": 1.6861722427172117, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.837890625, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8746423125267029, "num_tokens": 505609718.0, "step": 13255 }, { "epoch": 1.6862994529958022, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.05940246582031, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8743205666542053, "num_tokens": 505644337.0, "step": 13256 }, { "epoch": 1.6864266632743927, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.70409393310547, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8708094954490662, "num_tokens": 505681929.0, "step": 13257 }, { "epoch": 1.686553873552983, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.75221633911133, "learning_rate": 1e-06, "loss": 0.6356, "mean_token_accuracy": 0.8369665741920471, "num_tokens": 505719373.0, "step": 13258 }, { "epoch": 1.6866810838315736, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.024803161621094, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8453487157821655, "num_tokens": 505757767.0, "step": 13259 }, { "epoch": 1.686808294110164, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.89361572265625, "learning_rate": 1e-06, "loss": 0.691, "mean_token_accuracy": 0.8303483128547668, "num_tokens": 505794777.0, "step": 13260 }, { "epoch": 1.6869355043887546, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.120330810546875, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8671734929084778, "num_tokens": 505832672.0, "step": 13261 }, { "epoch": 1.6870627146673451, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.860260009765625, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8485252857208252, "num_tokens": 505873142.0, "step": 13262 }, { "epoch": 1.6871899249459357, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.919776916503906, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8633931279182434, "num_tokens": 505911994.0, "step": 13263 }, { "epoch": 1.687317135224526, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.00328826904297, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8645898103713989, "num_tokens": 505948874.0, "step": 13264 }, { "epoch": 1.6874443455031165, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.9884033203125, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8661614060401917, "num_tokens": 505989679.0, "step": 13265 }, { "epoch": 1.687571555781707, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.10706329345703, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8681446313858032, "num_tokens": 506033209.0, "step": 13266 }, { "epoch": 1.6876987660602976, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.24318313598633, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.8684962391853333, "num_tokens": 506065379.0, "step": 13267 }, { "epoch": 1.687825976338888, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.59234619140625, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8516828417778015, "num_tokens": 506099484.0, "step": 13268 }, { "epoch": 1.6879531866174786, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.93928909301758, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8450757265090942, "num_tokens": 506136559.0, "step": 13269 }, { "epoch": 1.6880803968960691, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.89753723144531, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.858150839805603, "num_tokens": 506172997.0, "step": 13270 }, { "epoch": 1.6882076071746597, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.7016716003418, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8739256858825684, "num_tokens": 506209858.0, "step": 13271 }, { "epoch": 1.6883348174532502, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.03944778442383, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8720270395278931, "num_tokens": 506242260.0, "step": 13272 }, { "epoch": 1.6884620277318407, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.32378387451172, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8468221426010132, "num_tokens": 506276856.0, "step": 13273 }, { "epoch": 1.6885892380104313, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.43901443481445, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8792517781257629, "num_tokens": 506311469.0, "step": 13274 }, { "epoch": 1.6887164482890218, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.658023834228516, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8696750402450562, "num_tokens": 506348240.0, "step": 13275 }, { "epoch": 1.6888436585676123, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.105838775634766, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.863144040107727, "num_tokens": 506387203.0, "step": 13276 }, { "epoch": 1.6889708688462028, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.41533660888672, "learning_rate": 1e-06, "loss": 0.6407, "mean_token_accuracy": 0.8364591002464294, "num_tokens": 506421869.0, "step": 13277 }, { "epoch": 1.6890980791247934, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.3306999206543, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8596511483192444, "num_tokens": 506458873.0, "step": 13278 }, { "epoch": 1.689225289403384, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.28336715698242, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8470619320869446, "num_tokens": 506495947.0, "step": 13279 }, { "epoch": 1.6893524996819744, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.813804626464844, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8419546484947205, "num_tokens": 506533098.0, "step": 13280 }, { "epoch": 1.689479709960565, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.24171447753906, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8716439008712769, "num_tokens": 506571346.0, "step": 13281 }, { "epoch": 1.6896069202391555, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.41022491455078, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8626723289489746, "num_tokens": 506607912.0, "step": 13282 }, { "epoch": 1.6897341305177458, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.930213928222656, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8499171733856201, "num_tokens": 506654258.0, "step": 13283 }, { "epoch": 1.6898613407963363, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.03062057495117, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8669902086257935, "num_tokens": 506693436.0, "step": 13284 }, { "epoch": 1.6899885510749268, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.461402893066406, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8635017275810242, "num_tokens": 506737383.0, "step": 13285 }, { "epoch": 1.6901157613535174, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.93449401855469, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8691881895065308, "num_tokens": 506772282.0, "step": 13286 }, { "epoch": 1.690242971632108, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.99538803100586, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8709951639175415, "num_tokens": 506807208.0, "step": 13287 }, { "epoch": 1.6903701819106984, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.183860778808594, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8680968880653381, "num_tokens": 506847952.0, "step": 13288 }, { "epoch": 1.6904973921892887, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.023841857910156, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8628557324409485, "num_tokens": 506888265.0, "step": 13289 }, { "epoch": 1.6906246024678793, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.873008728027344, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.863861083984375, "num_tokens": 506926096.0, "step": 13290 }, { "epoch": 1.6907518127464698, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.60890579223633, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8525707125663757, "num_tokens": 506962252.0, "step": 13291 }, { "epoch": 1.6908790230250603, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.589263916015625, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8653077483177185, "num_tokens": 507004709.0, "step": 13292 }, { "epoch": 1.6910062333036509, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.98733901977539, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8769429922103882, "num_tokens": 507041041.0, "step": 13293 }, { "epoch": 1.6911334435822414, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.870140075683594, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8535747528076172, "num_tokens": 507080251.0, "step": 13294 }, { "epoch": 1.691260653860832, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.2218132019043, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8601607084274292, "num_tokens": 507120813.0, "step": 13295 }, { "epoch": 1.6913878641394224, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.72000503540039, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.847388744354248, "num_tokens": 507160284.0, "step": 13296 }, { "epoch": 1.691515074418013, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.23222351074219, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8520694971084595, "num_tokens": 507205069.0, "step": 13297 }, { "epoch": 1.6916422846966035, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.700687408447266, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8692039251327515, "num_tokens": 507244689.0, "step": 13298 }, { "epoch": 1.691769494975194, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.27871322631836, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.860302746295929, "num_tokens": 507283917.0, "step": 13299 }, { "epoch": 1.6918967052537845, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.08255386352539, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8617568016052246, "num_tokens": 507327937.0, "step": 13300 }, { "epoch": 1.692023915532375, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.5260124206543, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8737795352935791, "num_tokens": 507369521.0, "step": 13301 }, { "epoch": 1.6921511258109656, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.90612030029297, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8702812790870667, "num_tokens": 507408157.0, "step": 13302 }, { "epoch": 1.6922783360895561, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.15696334838867, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8663296699523926, "num_tokens": 507448003.0, "step": 13303 }, { "epoch": 1.6924055463681467, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.82014465332031, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8692485690116882, "num_tokens": 507489970.0, "step": 13304 }, { "epoch": 1.6925327566467372, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.90261459350586, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8634235858917236, "num_tokens": 507524536.0, "step": 13305 }, { "epoch": 1.6926599669253277, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.98416519165039, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8828471302986145, "num_tokens": 507561575.0, "step": 13306 }, { "epoch": 1.692787177203918, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.7645149230957, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8621864318847656, "num_tokens": 507597212.0, "step": 13307 }, { "epoch": 1.6929143874825086, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.03878402709961, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8639523386955261, "num_tokens": 507634251.0, "step": 13308 }, { "epoch": 1.693041597761099, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.04814529418945, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8665335774421692, "num_tokens": 507671345.0, "step": 13309 }, { "epoch": 1.6931688080396896, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.24881362915039, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8499385714530945, "num_tokens": 507709797.0, "step": 13310 }, { "epoch": 1.6932960183182801, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.792205810546875, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.861778199672699, "num_tokens": 507747047.0, "step": 13311 }, { "epoch": 1.6934232285968707, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.1770133972168, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.861752450466156, "num_tokens": 507782187.0, "step": 13312 }, { "epoch": 1.693550438875461, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.97186279296875, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8471852540969849, "num_tokens": 507826714.0, "step": 13313 }, { "epoch": 1.6936776491540515, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.984130859375, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8516921997070312, "num_tokens": 507869423.0, "step": 13314 }, { "epoch": 1.693804859432642, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.37136459350586, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8705993890762329, "num_tokens": 507904711.0, "step": 13315 }, { "epoch": 1.6939320697112326, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.68668746948242, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8561210632324219, "num_tokens": 507947760.0, "step": 13316 }, { "epoch": 1.694059279989823, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.975547790527344, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8581914901733398, "num_tokens": 507984249.0, "step": 13317 }, { "epoch": 1.6941864902684136, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.227699279785156, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8589953184127808, "num_tokens": 508017206.0, "step": 13318 }, { "epoch": 1.6943137005470041, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.580467224121094, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8545403480529785, "num_tokens": 508056357.0, "step": 13319 }, { "epoch": 1.6944409108255947, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.82754898071289, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8534471988677979, "num_tokens": 508094870.0, "step": 13320 }, { "epoch": 1.6945681211041852, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.85685348510742, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8600198030471802, "num_tokens": 508120323.0, "step": 13321 }, { "epoch": 1.6946953313827757, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.69777297973633, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8525524735450745, "num_tokens": 508160018.0, "step": 13322 }, { "epoch": 1.6948225416613663, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.30230712890625, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8790918588638306, "num_tokens": 508200947.0, "step": 13323 }, { "epoch": 1.6949497519399568, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.75678253173828, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8475528359413147, "num_tokens": 508246457.0, "step": 13324 }, { "epoch": 1.6950769622185473, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.718936920166016, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8552849292755127, "num_tokens": 508281951.0, "step": 13325 }, { "epoch": 1.6952041724971378, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.07594680786133, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8424952626228333, "num_tokens": 508322571.0, "step": 13326 }, { "epoch": 1.6953313827757284, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.56867980957031, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8672423362731934, "num_tokens": 508357347.0, "step": 13327 }, { "epoch": 1.695458593054319, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.69485855102539, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8638631701469421, "num_tokens": 508401813.0, "step": 13328 }, { "epoch": 1.6955858033329094, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.99575424194336, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8491714596748352, "num_tokens": 508440457.0, "step": 13329 }, { "epoch": 1.6957130136115, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.395267486572266, "learning_rate": 1e-06, "loss": 0.5842, "mean_token_accuracy": 0.8526955842971802, "num_tokens": 508472272.0, "step": 13330 }, { "epoch": 1.6958402238900905, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.094207763671875, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8728209137916565, "num_tokens": 508509352.0, "step": 13331 }, { "epoch": 1.6959674341686808, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.58738327026367, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8692004680633545, "num_tokens": 508552515.0, "step": 13332 }, { "epoch": 1.6960946444472713, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.9620475769043, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8718284368515015, "num_tokens": 508591399.0, "step": 13333 }, { "epoch": 1.6962218547258618, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.80093765258789, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.865240216255188, "num_tokens": 508630427.0, "step": 13334 }, { "epoch": 1.6963490650044524, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.34596252441406, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8688690066337585, "num_tokens": 508667469.0, "step": 13335 }, { "epoch": 1.696476275283043, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.78403091430664, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8643837571144104, "num_tokens": 508706308.0, "step": 13336 }, { "epoch": 1.6966034855616334, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.997623443603516, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8487226963043213, "num_tokens": 508738968.0, "step": 13337 }, { "epoch": 1.6967306958402237, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.00105285644531, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8680144548416138, "num_tokens": 508769879.0, "step": 13338 }, { "epoch": 1.6968579061188143, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.92339324951172, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.845806360244751, "num_tokens": 508812277.0, "step": 13339 }, { "epoch": 1.6969851163974048, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.067771911621094, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8506283164024353, "num_tokens": 508849293.0, "step": 13340 }, { "epoch": 1.6971123266759953, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.71096420288086, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8628751039505005, "num_tokens": 508885798.0, "step": 13341 }, { "epoch": 1.6972395369545858, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.94866943359375, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8584787845611572, "num_tokens": 508920491.0, "step": 13342 }, { "epoch": 1.6973667472331764, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.87318420410156, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8599903583526611, "num_tokens": 508956161.0, "step": 13343 }, { "epoch": 1.697493957511767, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.980709075927734, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8572951555252075, "num_tokens": 508995696.0, "step": 13344 }, { "epoch": 1.6976211677903574, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.8180046081543, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.865416407585144, "num_tokens": 509029505.0, "step": 13345 }, { "epoch": 1.697748378068948, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.989837646484375, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8671557903289795, "num_tokens": 509062815.0, "step": 13346 }, { "epoch": 1.6978755883475385, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.25799560546875, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8674628734588623, "num_tokens": 509103376.0, "step": 13347 }, { "epoch": 1.698002798626129, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.689640045166016, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.870302140712738, "num_tokens": 509142797.0, "step": 13348 }, { "epoch": 1.6981300089047195, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.04244613647461, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8497302532196045, "num_tokens": 509182724.0, "step": 13349 }, { "epoch": 1.69825721918331, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.88771438598633, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8690447807312012, "num_tokens": 509217856.0, "step": 13350 }, { "epoch": 1.6983844294619006, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.118465423583984, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8616419434547424, "num_tokens": 509251581.0, "step": 13351 }, { "epoch": 1.6985116397404911, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.069786071777344, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.87757408618927, "num_tokens": 509286687.0, "step": 13352 }, { "epoch": 1.6986388500190817, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.9141845703125, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8559876680374146, "num_tokens": 509323372.0, "step": 13353 }, { "epoch": 1.6987660602976722, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.4456901550293, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8536967635154724, "num_tokens": 509362472.0, "step": 13354 }, { "epoch": 1.6988932705762627, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.695438385009766, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8619523048400879, "num_tokens": 509397676.0, "step": 13355 }, { "epoch": 1.699020480854853, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.2796745300293, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8646494746208191, "num_tokens": 509441100.0, "step": 13356 }, { "epoch": 1.6991476911334435, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.941062927246094, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8561897873878479, "num_tokens": 509485430.0, "step": 13357 }, { "epoch": 1.699274901412034, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.207618713378906, "learning_rate": 1e-06, "loss": 0.6633, "mean_token_accuracy": 0.8310017585754395, "num_tokens": 509520102.0, "step": 13358 }, { "epoch": 1.6994021116906246, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.52164077758789, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8617565631866455, "num_tokens": 509556525.0, "step": 13359 }, { "epoch": 1.6995293219692151, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.122840881347656, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8526270389556885, "num_tokens": 509596899.0, "step": 13360 }, { "epoch": 1.6996565322478057, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.52504348754883, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8627264499664307, "num_tokens": 509632125.0, "step": 13361 }, { "epoch": 1.699783742526396, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.16292953491211, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8558999300003052, "num_tokens": 509668131.0, "step": 13362 }, { "epoch": 1.6999109528049865, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.27093505859375, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8736424446105957, "num_tokens": 509704078.0, "step": 13363 }, { "epoch": 1.700038163083577, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.774658203125, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8738426566123962, "num_tokens": 509739464.0, "step": 13364 }, { "epoch": 1.7001653733621676, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.94672775268555, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8616229295730591, "num_tokens": 509777970.0, "step": 13365 }, { "epoch": 1.700292583640758, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.07294464111328, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8477911949157715, "num_tokens": 509817010.0, "step": 13366 }, { "epoch": 1.7004197939193486, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.7193489074707, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8662196397781372, "num_tokens": 509854192.0, "step": 13367 }, { "epoch": 1.7005470041979391, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.053646087646484, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8739129304885864, "num_tokens": 509888633.0, "step": 13368 }, { "epoch": 1.7006742144765297, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.526241302490234, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8624028563499451, "num_tokens": 509928474.0, "step": 13369 }, { "epoch": 1.7008014247551202, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.11076354980469, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8542479276657104, "num_tokens": 509968875.0, "step": 13370 }, { "epoch": 1.7009286350337107, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.56709289550781, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8610344529151917, "num_tokens": 510006021.0, "step": 13371 }, { "epoch": 1.7010558453123013, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.96686553955078, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.865570068359375, "num_tokens": 510042111.0, "step": 13372 }, { "epoch": 1.7011830555908918, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.25062561035156, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8672565817832947, "num_tokens": 510074894.0, "step": 13373 }, { "epoch": 1.7013102658694823, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.59823989868164, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8515765070915222, "num_tokens": 510111516.0, "step": 13374 }, { "epoch": 1.7014374761480728, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.70482635498047, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8642832636833191, "num_tokens": 510151549.0, "step": 13375 }, { "epoch": 1.7015646864266634, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.57345199584961, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8611112236976624, "num_tokens": 510189215.0, "step": 13376 }, { "epoch": 1.701691896705254, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.208412170410156, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8804118633270264, "num_tokens": 510231189.0, "step": 13377 }, { "epoch": 1.7018191069838444, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.62580108642578, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8648693561553955, "num_tokens": 510273956.0, "step": 13378 }, { "epoch": 1.701946317262435, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.954837799072266, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8786853551864624, "num_tokens": 510315095.0, "step": 13379 }, { "epoch": 1.7020735275410255, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.93783187866211, "learning_rate": 1e-06, "loss": 0.6381, "mean_token_accuracy": 0.8397668600082397, "num_tokens": 510350753.0, "step": 13380 }, { "epoch": 1.7022007378196158, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.79559326171875, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8665691018104553, "num_tokens": 510389257.0, "step": 13381 }, { "epoch": 1.7023279480982063, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.83081817626953, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8619245290756226, "num_tokens": 510428879.0, "step": 13382 }, { "epoch": 1.7024551583767968, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.5622444152832, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8644411563873291, "num_tokens": 510468409.0, "step": 13383 }, { "epoch": 1.7025823686553874, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.98738479614258, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8766834735870361, "num_tokens": 510503988.0, "step": 13384 }, { "epoch": 1.702709578933978, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.578369140625, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8594966530799866, "num_tokens": 510539009.0, "step": 13385 }, { "epoch": 1.7028367892125684, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.93734359741211, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8754705190658569, "num_tokens": 510576177.0, "step": 13386 }, { "epoch": 1.7029639994911587, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.157447814941406, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8609287738800049, "num_tokens": 510622194.0, "step": 13387 }, { "epoch": 1.7030912097697493, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.17781066894531, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8782158493995667, "num_tokens": 510663564.0, "step": 13388 }, { "epoch": 1.7032184200483398, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.37873840332031, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8709052205085754, "num_tokens": 510702153.0, "step": 13389 }, { "epoch": 1.7033456303269303, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.890838623046875, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8576335310935974, "num_tokens": 510742812.0, "step": 13390 }, { "epoch": 1.7034728406055208, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.12410354614258, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8553450703620911, "num_tokens": 510783363.0, "step": 13391 }, { "epoch": 1.7036000508841114, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.009029388427734, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8474369049072266, "num_tokens": 510818362.0, "step": 13392 }, { "epoch": 1.703727261162702, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.056617736816406, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8723849654197693, "num_tokens": 510854732.0, "step": 13393 }, { "epoch": 1.7038544714412924, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.04109573364258, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8548685312271118, "num_tokens": 510895107.0, "step": 13394 }, { "epoch": 1.703981681719883, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.16394805908203, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8576539754867554, "num_tokens": 510930737.0, "step": 13395 }, { "epoch": 1.7041088919984735, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.83537292480469, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8703079223632812, "num_tokens": 510964646.0, "step": 13396 }, { "epoch": 1.704236102277064, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.38839340209961, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8657007217407227, "num_tokens": 511000583.0, "step": 13397 }, { "epoch": 1.7043633125556545, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.847938537597656, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8633673191070557, "num_tokens": 511033729.0, "step": 13398 }, { "epoch": 1.704490522834245, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.19943618774414, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8673010468482971, "num_tokens": 511074099.0, "step": 13399 }, { "epoch": 1.7046177331128356, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.761741638183594, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8499813079833984, "num_tokens": 511112473.0, "step": 13400 }, { "epoch": 1.7047449433914261, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.40494155883789, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8616572618484497, "num_tokens": 511149839.0, "step": 13401 }, { "epoch": 1.7048721536700167, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.698646545410156, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8509925007820129, "num_tokens": 511183054.0, "step": 13402 }, { "epoch": 1.7049993639486072, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.77267837524414, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8655399680137634, "num_tokens": 511218498.0, "step": 13403 }, { "epoch": 1.7051265742271977, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.24853515625, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8587076663970947, "num_tokens": 511259776.0, "step": 13404 }, { "epoch": 1.705253784505788, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.11981201171875, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8675000667572021, "num_tokens": 511297331.0, "step": 13405 }, { "epoch": 1.7053809947843785, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.15381622314453, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8731613159179688, "num_tokens": 511331055.0, "step": 13406 }, { "epoch": 1.705508205062969, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.44324493408203, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8778389096260071, "num_tokens": 511368651.0, "step": 13407 }, { "epoch": 1.7056354153415596, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.575599670410156, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.8657544851303101, "num_tokens": 511402163.0, "step": 13408 }, { "epoch": 1.7057626256201501, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.02134323120117, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8539239168167114, "num_tokens": 511443769.0, "step": 13409 }, { "epoch": 1.7058898358987407, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.88916015625, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8588945865631104, "num_tokens": 511481038.0, "step": 13410 }, { "epoch": 1.706017046177331, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.02303695678711, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8645153045654297, "num_tokens": 511522940.0, "step": 13411 }, { "epoch": 1.7061442564559215, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.189414978027344, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.859189510345459, "num_tokens": 511562630.0, "step": 13412 }, { "epoch": 1.706271466734512, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.93994140625, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8469715118408203, "num_tokens": 511598986.0, "step": 13413 }, { "epoch": 1.7063986770131025, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.97324752807617, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8631236553192139, "num_tokens": 511633582.0, "step": 13414 }, { "epoch": 1.706525887291693, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.9297981262207, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8605740070343018, "num_tokens": 511673296.0, "step": 13415 }, { "epoch": 1.7066530975702836, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.961177825927734, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8544423580169678, "num_tokens": 511707890.0, "step": 13416 }, { "epoch": 1.7067803078488741, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.80570602416992, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8693983554840088, "num_tokens": 511741597.0, "step": 13417 }, { "epoch": 1.7069075181274647, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.65925598144531, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.865372896194458, "num_tokens": 511776816.0, "step": 13418 }, { "epoch": 1.7070347284060552, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.92672348022461, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8560925722122192, "num_tokens": 511814866.0, "step": 13419 }, { "epoch": 1.7071619386846457, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.821842193603516, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8619289398193359, "num_tokens": 511852458.0, "step": 13420 }, { "epoch": 1.7072891489632362, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.37508773803711, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8564700484275818, "num_tokens": 511891201.0, "step": 13421 }, { "epoch": 1.7074163592418268, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.86860656738281, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8612723350524902, "num_tokens": 511930862.0, "step": 13422 }, { "epoch": 1.7075435695204173, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.232051849365234, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8690187931060791, "num_tokens": 511967695.0, "step": 13423 }, { "epoch": 1.7076707797990078, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.30712890625, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.875239372253418, "num_tokens": 512005976.0, "step": 13424 }, { "epoch": 1.7077979900775984, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.688026428222656, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8622363209724426, "num_tokens": 512041575.0, "step": 13425 }, { "epoch": 1.7079252003561889, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.645877838134766, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8568055033683777, "num_tokens": 512076171.0, "step": 13426 }, { "epoch": 1.7080524106347794, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.973934173583984, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8564319610595703, "num_tokens": 512112082.0, "step": 13427 }, { "epoch": 1.70817962091337, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.963321685791016, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8479900360107422, "num_tokens": 512149126.0, "step": 13428 }, { "epoch": 1.7083068311919605, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.892364501953125, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8629266023635864, "num_tokens": 512182368.0, "step": 13429 }, { "epoch": 1.7084340414705508, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.313533782958984, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8427438735961914, "num_tokens": 512218224.0, "step": 13430 }, { "epoch": 1.7085612517491413, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.480979919433594, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.860866904258728, "num_tokens": 512255523.0, "step": 13431 }, { "epoch": 1.7086884620277318, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.22333526611328, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8717199563980103, "num_tokens": 512290514.0, "step": 13432 }, { "epoch": 1.7088156723063224, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.660621643066406, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8705756664276123, "num_tokens": 512328468.0, "step": 13433 }, { "epoch": 1.708942882584913, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.036476135253906, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8600659370422363, "num_tokens": 512359786.0, "step": 13434 }, { "epoch": 1.7090700928635034, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.72466278076172, "learning_rate": 1e-06, "loss": 0.642, "mean_token_accuracy": 0.83937007188797, "num_tokens": 512395679.0, "step": 13435 }, { "epoch": 1.7091973031420937, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.0048828125, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8579343557357788, "num_tokens": 512433491.0, "step": 13436 }, { "epoch": 1.7093245134206843, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.87796401977539, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8547242283821106, "num_tokens": 512470215.0, "step": 13437 }, { "epoch": 1.7094517236992748, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.98950958251953, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8574012517929077, "num_tokens": 512507055.0, "step": 13438 }, { "epoch": 1.7095789339778653, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.799137115478516, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8441279530525208, "num_tokens": 512547877.0, "step": 13439 }, { "epoch": 1.7097061442564558, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.149723052978516, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.847464919090271, "num_tokens": 512586558.0, "step": 13440 }, { "epoch": 1.7098333545350464, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.91111755371094, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.858679473400116, "num_tokens": 512631181.0, "step": 13441 }, { "epoch": 1.709960564813637, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.118072509765625, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8712419867515564, "num_tokens": 512667925.0, "step": 13442 }, { "epoch": 1.7100877750922274, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.80134963989258, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8447346687316895, "num_tokens": 512703136.0, "step": 13443 }, { "epoch": 1.710214985370818, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.996822357177734, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8639647364616394, "num_tokens": 512739001.0, "step": 13444 }, { "epoch": 1.7103421956494085, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.23381423950195, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8658486604690552, "num_tokens": 512780517.0, "step": 13445 }, { "epoch": 1.710469405927999, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.34941482543945, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8677977919578552, "num_tokens": 512815964.0, "step": 13446 }, { "epoch": 1.7105966162065895, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.03718566894531, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8639430999755859, "num_tokens": 512855451.0, "step": 13447 }, { "epoch": 1.71072382648518, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.43559265136719, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8658896088600159, "num_tokens": 512895809.0, "step": 13448 }, { "epoch": 1.7108510367637706, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.91583251953125, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8657798171043396, "num_tokens": 512936424.0, "step": 13449 }, { "epoch": 1.7109782470423611, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.1189079284668, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8757333159446716, "num_tokens": 512976486.0, "step": 13450 }, { "epoch": 1.7111054573209517, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.10297775268555, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8520797491073608, "num_tokens": 513014131.0, "step": 13451 }, { "epoch": 1.7112326675995422, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.01382827758789, "learning_rate": 1e-06, "loss": 0.6485, "mean_token_accuracy": 0.8326529264450073, "num_tokens": 513054706.0, "step": 13452 }, { "epoch": 1.7113598778781327, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.28790283203125, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8661903142929077, "num_tokens": 513094089.0, "step": 13453 }, { "epoch": 1.711487088156723, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.48530197143555, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8583776950836182, "num_tokens": 513136306.0, "step": 13454 }, { "epoch": 1.7116142984353135, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.14896011352539, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8571313619613647, "num_tokens": 513174376.0, "step": 13455 }, { "epoch": 1.711741508713904, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.97869110107422, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8638688325881958, "num_tokens": 513211834.0, "step": 13456 }, { "epoch": 1.7118687189924946, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.99390411376953, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8509756326675415, "num_tokens": 513241765.0, "step": 13457 }, { "epoch": 1.7119959292710851, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.28043746948242, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8480397462844849, "num_tokens": 513283510.0, "step": 13458 }, { "epoch": 1.7121231395496757, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.489601135253906, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.858582615852356, "num_tokens": 513320746.0, "step": 13459 }, { "epoch": 1.712250349828266, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.004493713378906, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8574138879776001, "num_tokens": 513358036.0, "step": 13460 }, { "epoch": 1.7123775601068565, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.07572937011719, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8663594722747803, "num_tokens": 513395156.0, "step": 13461 }, { "epoch": 1.712504770385447, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.89700698852539, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8557649850845337, "num_tokens": 513438631.0, "step": 13462 }, { "epoch": 1.7126319806640375, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.25202560424805, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.850501298904419, "num_tokens": 513471225.0, "step": 13463 }, { "epoch": 1.712759190942628, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.44550704956055, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8521126508712769, "num_tokens": 513513824.0, "step": 13464 }, { "epoch": 1.7128864012212186, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.2843017578125, "learning_rate": 1e-06, "loss": 0.5842, "mean_token_accuracy": 0.853018581867218, "num_tokens": 513556263.0, "step": 13465 }, { "epoch": 1.7130136114998091, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.633934020996094, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8649259805679321, "num_tokens": 513595093.0, "step": 13466 }, { "epoch": 1.7131408217783997, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.98503112792969, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8524840474128723, "num_tokens": 513633761.0, "step": 13467 }, { "epoch": 1.7132680320569902, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.71084976196289, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8581893444061279, "num_tokens": 513671404.0, "step": 13468 }, { "epoch": 1.7133952423355807, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.05457305908203, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8587716817855835, "num_tokens": 513713752.0, "step": 13469 }, { "epoch": 1.7135224526141712, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.54661560058594, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.856743335723877, "num_tokens": 513755261.0, "step": 13470 }, { "epoch": 1.7136496628927618, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.87843704223633, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8654909729957581, "num_tokens": 513799042.0, "step": 13471 }, { "epoch": 1.7137768731713523, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.82978057861328, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8495088219642639, "num_tokens": 513838148.0, "step": 13472 }, { "epoch": 1.7139040834499428, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.90168762207031, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8666058778762817, "num_tokens": 513878960.0, "step": 13473 }, { "epoch": 1.7140312937285334, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.84267807006836, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8627251386642456, "num_tokens": 513914083.0, "step": 13474 }, { "epoch": 1.7141585040071239, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.136322021484375, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8489779233932495, "num_tokens": 513951591.0, "step": 13475 }, { "epoch": 1.7142857142857144, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.74485397338867, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8455920219421387, "num_tokens": 513986127.0, "step": 13476 }, { "epoch": 1.714412924564305, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.06178283691406, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8555387258529663, "num_tokens": 514028248.0, "step": 13477 }, { "epoch": 1.7145401348428955, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.83680725097656, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8632325530052185, "num_tokens": 514062757.0, "step": 13478 }, { "epoch": 1.7146673451214858, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.66592025756836, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8474600315093994, "num_tokens": 514102635.0, "step": 13479 }, { "epoch": 1.7147945554000763, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.08182144165039, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8588709831237793, "num_tokens": 514140883.0, "step": 13480 }, { "epoch": 1.7149217656786668, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.71169662475586, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8700498342514038, "num_tokens": 514176536.0, "step": 13481 }, { "epoch": 1.7150489759572574, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.87822341918945, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8543789982795715, "num_tokens": 514209969.0, "step": 13482 }, { "epoch": 1.7151761862358479, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.80426025390625, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8588822484016418, "num_tokens": 514249679.0, "step": 13483 }, { "epoch": 1.7153033965144384, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.30491256713867, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8549901247024536, "num_tokens": 514292719.0, "step": 13484 }, { "epoch": 1.7154306067930287, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.45463943481445, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8531396985054016, "num_tokens": 514323396.0, "step": 13485 }, { "epoch": 1.7155578170716193, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.269039154052734, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8666890263557434, "num_tokens": 514360951.0, "step": 13486 }, { "epoch": 1.7156850273502098, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.69545364379883, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8448870778083801, "num_tokens": 514403879.0, "step": 13487 }, { "epoch": 1.7158122376288003, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.70783233642578, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8677148818969727, "num_tokens": 514446628.0, "step": 13488 }, { "epoch": 1.7159394479073908, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.60836410522461, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8601634502410889, "num_tokens": 514486132.0, "step": 13489 }, { "epoch": 1.7160666581859814, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.49182891845703, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8480072021484375, "num_tokens": 514526384.0, "step": 13490 }, { "epoch": 1.716193868464572, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.45840072631836, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8585015535354614, "num_tokens": 514565828.0, "step": 13491 }, { "epoch": 1.7163210787431624, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.28523254394531, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8555582761764526, "num_tokens": 514605260.0, "step": 13492 }, { "epoch": 1.716448289021753, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.936485290527344, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8614389896392822, "num_tokens": 514643433.0, "step": 13493 }, { "epoch": 1.7165754993003435, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.848716735839844, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8628910779953003, "num_tokens": 514683264.0, "step": 13494 }, { "epoch": 1.716702709578934, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.632408142089844, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8680896162986755, "num_tokens": 514719775.0, "step": 13495 }, { "epoch": 1.7168299198575245, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.162841796875, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8527206182479858, "num_tokens": 514766287.0, "step": 13496 }, { "epoch": 1.716957130136115, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.64588928222656, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8693422079086304, "num_tokens": 514806725.0, "step": 13497 }, { "epoch": 1.7170843404147056, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.4915657043457, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8641126155853271, "num_tokens": 514844008.0, "step": 13498 }, { "epoch": 1.7172115506932961, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.73400115966797, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8479378819465637, "num_tokens": 514880356.0, "step": 13499 }, { "epoch": 1.7173387609718866, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.33453369140625, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.871256411075592, "num_tokens": 514915309.0, "step": 13500 }, { "epoch": 1.7174659712504772, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.348236083984375, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8541203141212463, "num_tokens": 514956496.0, "step": 13501 }, { "epoch": 1.7175931815290677, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.95847702026367, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8607383966445923, "num_tokens": 514992247.0, "step": 13502 }, { "epoch": 1.717720391807658, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.23387908935547, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.853227972984314, "num_tokens": 515035443.0, "step": 13503 }, { "epoch": 1.7178476020862485, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.253639221191406, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8651409149169922, "num_tokens": 515072279.0, "step": 13504 }, { "epoch": 1.717974812364839, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.8626594543457, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8589221239089966, "num_tokens": 515112547.0, "step": 13505 }, { "epoch": 1.7181020226434296, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.16781997680664, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8474321961402893, "num_tokens": 515146814.0, "step": 13506 }, { "epoch": 1.7182292329220201, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.029197692871094, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8624811172485352, "num_tokens": 515184393.0, "step": 13507 }, { "epoch": 1.7183564432006107, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.31108856201172, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8527669906616211, "num_tokens": 515225312.0, "step": 13508 }, { "epoch": 1.718483653479201, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.030704498291016, "learning_rate": 1e-06, "loss": 0.6368, "mean_token_accuracy": 0.838833749294281, "num_tokens": 515261330.0, "step": 13509 }, { "epoch": 1.7186108637577915, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.458396911621094, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8654865026473999, "num_tokens": 515299325.0, "step": 13510 }, { "epoch": 1.718738074036382, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.192115783691406, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8601530194282532, "num_tokens": 515335213.0, "step": 13511 }, { "epoch": 1.7188652843149725, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.35717010498047, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8709648251533508, "num_tokens": 515365699.0, "step": 13512 }, { "epoch": 1.718992494593563, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.8403434753418, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.867864727973938, "num_tokens": 515396025.0, "step": 13513 }, { "epoch": 1.7191197048721536, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.86383056640625, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8619937300682068, "num_tokens": 515429724.0, "step": 13514 }, { "epoch": 1.7192469151507441, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.63578796386719, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8560656309127808, "num_tokens": 515465894.0, "step": 13515 }, { "epoch": 1.7193741254293347, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.421958923339844, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8789904117584229, "num_tokens": 515498894.0, "step": 13516 }, { "epoch": 1.7195013357079252, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.90861511230469, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8598970770835876, "num_tokens": 515538783.0, "step": 13517 }, { "epoch": 1.7196285459865157, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.35970687866211, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8589510917663574, "num_tokens": 515573107.0, "step": 13518 }, { "epoch": 1.7197557562651062, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.908653259277344, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8628398776054382, "num_tokens": 515614901.0, "step": 13519 }, { "epoch": 1.7198829665436968, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.9039192199707, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8639592528343201, "num_tokens": 515654197.0, "step": 13520 }, { "epoch": 1.7200101768222873, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.10070037841797, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8783218264579773, "num_tokens": 515697003.0, "step": 13521 }, { "epoch": 1.7201373871008778, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.573341369628906, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8570964336395264, "num_tokens": 515737801.0, "step": 13522 }, { "epoch": 1.7202645973794684, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.45921325683594, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8518813848495483, "num_tokens": 515774711.0, "step": 13523 }, { "epoch": 1.7203918076580589, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.539798736572266, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8520643711090088, "num_tokens": 515811728.0, "step": 13524 }, { "epoch": 1.7205190179366494, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.53411865234375, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8659875392913818, "num_tokens": 515844425.0, "step": 13525 }, { "epoch": 1.72064622821524, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.121849060058594, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8613595366477966, "num_tokens": 515882439.0, "step": 13526 }, { "epoch": 1.7207734384938305, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 44.16061782836914, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.839214563369751, "num_tokens": 515922168.0, "step": 13527 }, { "epoch": 1.7209006487724208, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.09825897216797, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8681466579437256, "num_tokens": 515961840.0, "step": 13528 }, { "epoch": 1.7210278590510113, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.138710021972656, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8592108488082886, "num_tokens": 515997586.0, "step": 13529 }, { "epoch": 1.7211550693296018, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.80293273925781, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8655177354812622, "num_tokens": 516036660.0, "step": 13530 }, { "epoch": 1.7212822796081924, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.07633972167969, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8522434830665588, "num_tokens": 516070846.0, "step": 13531 }, { "epoch": 1.7214094898867829, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.79867935180664, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8824607133865356, "num_tokens": 516107027.0, "step": 13532 }, { "epoch": 1.7215367001653734, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 42.911983489990234, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8605991005897522, "num_tokens": 516143971.0, "step": 13533 }, { "epoch": 1.7216639104439637, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 44.062339782714844, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.861882209777832, "num_tokens": 516180617.0, "step": 13534 }, { "epoch": 1.7217911207225542, "ewc_loss": 0.1455078125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001239776611328125, "grad_norm": 43.217559814453125, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8619661331176758, "num_tokens": 516220371.0, "step": 13535 }, { "epoch": 1.7219183310011448, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.49018859863281, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8633595108985901, "num_tokens": 516258642.0, "step": 13536 }, { "epoch": 1.7220455412797353, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 43.68326187133789, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8536673784255981, "num_tokens": 516298971.0, "step": 13537 }, { "epoch": 1.7221727515583258, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.81145095825195, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8725968599319458, "num_tokens": 516334457.0, "step": 13538 }, { "epoch": 1.7222999618369164, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.215843200683594, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8856150507926941, "num_tokens": 516374585.0, "step": 13539 }, { "epoch": 1.7224271721155069, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.326419830322266, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8480321168899536, "num_tokens": 516414180.0, "step": 13540 }, { "epoch": 1.7225543823940974, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.2219352722168, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8684232234954834, "num_tokens": 516450189.0, "step": 13541 }, { "epoch": 1.722681592672688, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.1706428527832, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8592957258224487, "num_tokens": 516487549.0, "step": 13542 }, { "epoch": 1.7228088029512785, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.6330451965332, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8464311957359314, "num_tokens": 516526649.0, "step": 13543 }, { "epoch": 1.722936013229869, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.508056640625, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8654824495315552, "num_tokens": 516563829.0, "step": 13544 }, { "epoch": 1.7230632235084595, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.39644241333008, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8432662487030029, "num_tokens": 516605179.0, "step": 13545 }, { "epoch": 1.72319043378705, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.46376037597656, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8588412404060364, "num_tokens": 516641576.0, "step": 13546 }, { "epoch": 1.7233176440656406, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.795135498046875, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8567322492599487, "num_tokens": 516678687.0, "step": 13547 }, { "epoch": 1.7234448543442311, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.48725891113281, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8546510934829712, "num_tokens": 516711484.0, "step": 13548 }, { "epoch": 1.7235720646228216, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.539703369140625, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8563870787620544, "num_tokens": 516750598.0, "step": 13549 }, { "epoch": 1.7236992749014122, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 44.14253616333008, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8673135638237, "num_tokens": 516793438.0, "step": 13550 }, { "epoch": 1.7238264851800027, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.78749084472656, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.874305009841919, "num_tokens": 516828692.0, "step": 13551 }, { "epoch": 1.723953695458593, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.82344436645508, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8603876829147339, "num_tokens": 516867592.0, "step": 13552 }, { "epoch": 1.7240809057371835, "ewc_loss": 0.146484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012493133544921875, "grad_norm": 42.988487243652344, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8490111827850342, "num_tokens": 516907668.0, "step": 13553 }, { "epoch": 1.724208116015774, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.56807327270508, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8579636812210083, "num_tokens": 516944628.0, "step": 13554 }, { "epoch": 1.7243353262943646, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.061344146728516, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8548218011856079, "num_tokens": 516989027.0, "step": 13555 }, { "epoch": 1.7244625365729551, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.278228759765625, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8553311228752136, "num_tokens": 517024464.0, "step": 13556 }, { "epoch": 1.7245897468515456, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.22621154785156, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8649142980575562, "num_tokens": 517067714.0, "step": 13557 }, { "epoch": 1.724716957130136, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.0643424987793, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8626452684402466, "num_tokens": 517107564.0, "step": 13558 }, { "epoch": 1.7248441674087265, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.04953384399414, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8642793297767639, "num_tokens": 517148240.0, "step": 13559 }, { "epoch": 1.724971377687317, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.438438415527344, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8548699617385864, "num_tokens": 517187841.0, "step": 13560 }, { "epoch": 1.7250985879659075, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.25737380981445, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.844060480594635, "num_tokens": 517225732.0, "step": 13561 }, { "epoch": 1.725225798244498, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.43037796020508, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8437508344650269, "num_tokens": 517260423.0, "step": 13562 }, { "epoch": 1.7253530085230886, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.164329528808594, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.846520721912384, "num_tokens": 517300862.0, "step": 13563 }, { "epoch": 1.7254802188016791, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.54021453857422, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.864293098449707, "num_tokens": 517341010.0, "step": 13564 }, { "epoch": 1.7256074290802697, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 44.01004409790039, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8433477282524109, "num_tokens": 517376853.0, "step": 13565 }, { "epoch": 1.7257346393588602, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.71855926513672, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.854339063167572, "num_tokens": 517421854.0, "step": 13566 }, { "epoch": 1.7258618496374507, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.62025451660156, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8536575436592102, "num_tokens": 517461711.0, "step": 13567 }, { "epoch": 1.7259890599160412, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.526004791259766, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8664504289627075, "num_tokens": 517494755.0, "step": 13568 }, { "epoch": 1.7261162701946318, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 44.11207962036133, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8518451452255249, "num_tokens": 517539061.0, "step": 13569 }, { "epoch": 1.7262434804732223, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.20979309082031, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8563475608825684, "num_tokens": 517579768.0, "step": 13570 }, { "epoch": 1.7263706907518128, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.77154541015625, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8716890215873718, "num_tokens": 517612006.0, "step": 13571 }, { "epoch": 1.7264979010304033, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.36435317993164, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8599667549133301, "num_tokens": 517647547.0, "step": 13572 }, { "epoch": 1.7266251113089939, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.744102478027344, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.849953830242157, "num_tokens": 517689067.0, "step": 13573 }, { "epoch": 1.7267523215875844, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.38484191894531, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8654376864433289, "num_tokens": 517729967.0, "step": 13574 }, { "epoch": 1.726879531866175, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.570003509521484, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8593959808349609, "num_tokens": 517770427.0, "step": 13575 }, { "epoch": 1.7270067421447655, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.79775619506836, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8572880029678345, "num_tokens": 517804202.0, "step": 13576 }, { "epoch": 1.7271339524233558, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.27739715576172, "learning_rate": 1e-06, "loss": 0.5842, "mean_token_accuracy": 0.8523104190826416, "num_tokens": 517834825.0, "step": 13577 }, { "epoch": 1.7272611627019463, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.64414596557617, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8530969619750977, "num_tokens": 517874600.0, "step": 13578 }, { "epoch": 1.7273883729805368, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.96847915649414, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8486844301223755, "num_tokens": 517913488.0, "step": 13579 }, { "epoch": 1.7275155832591274, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.20865249633789, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8737138509750366, "num_tokens": 517951094.0, "step": 13580 }, { "epoch": 1.7276427935377179, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.76616668701172, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8492558002471924, "num_tokens": 517993811.0, "step": 13581 }, { "epoch": 1.7277700038163084, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.840965270996094, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8621073961257935, "num_tokens": 518032860.0, "step": 13582 }, { "epoch": 1.7278972140948987, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.67600631713867, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8556568622589111, "num_tokens": 518070256.0, "step": 13583 }, { "epoch": 1.7280244243734892, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.94007110595703, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8658251762390137, "num_tokens": 518102828.0, "step": 13584 }, { "epoch": 1.7281516346520798, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.484703063964844, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8731107115745544, "num_tokens": 518146859.0, "step": 13585 }, { "epoch": 1.7282788449306703, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.9208869934082, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8674424290657043, "num_tokens": 518186173.0, "step": 13586 }, { "epoch": 1.7284060552092608, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.4041633605957, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8444383144378662, "num_tokens": 518221903.0, "step": 13587 }, { "epoch": 1.7285332654878514, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.5941162109375, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8503226041793823, "num_tokens": 518258048.0, "step": 13588 }, { "epoch": 1.7286604757664419, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.24795913696289, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8498805165290833, "num_tokens": 518299127.0, "step": 13589 }, { "epoch": 1.7287876860450324, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.531211853027344, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8696858882904053, "num_tokens": 518336520.0, "step": 13590 }, { "epoch": 1.728914896323623, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.92233657836914, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8555774688720703, "num_tokens": 518368484.0, "step": 13591 }, { "epoch": 1.7290421066022135, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.86464309692383, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8570714592933655, "num_tokens": 518408439.0, "step": 13592 }, { "epoch": 1.729169316880804, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 44.03388214111328, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8399844169616699, "num_tokens": 518447414.0, "step": 13593 }, { "epoch": 1.7292965271593945, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.00374984741211, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8552363514900208, "num_tokens": 518486455.0, "step": 13594 }, { "epoch": 1.729423737437985, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.88279342651367, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8765005469322205, "num_tokens": 518527023.0, "step": 13595 }, { "epoch": 1.7295509477165756, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 43.275901794433594, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8542009592056274, "num_tokens": 518563285.0, "step": 13596 }, { "epoch": 1.729678157995166, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.300758361816406, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8602284789085388, "num_tokens": 518607610.0, "step": 13597 }, { "epoch": 1.7298053682737566, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.46908950805664, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.869859516620636, "num_tokens": 518649434.0, "step": 13598 }, { "epoch": 1.7299325785523472, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.32818603515625, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8668015599250793, "num_tokens": 518684508.0, "step": 13599 }, { "epoch": 1.7300597888309377, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.91652297973633, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8646978139877319, "num_tokens": 518717883.0, "step": 13600 }, { "epoch": 1.730186999109528, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.05897903442383, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8655326962471008, "num_tokens": 518756158.0, "step": 13601 }, { "epoch": 1.7303142093881185, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.869869232177734, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8593142032623291, "num_tokens": 518793003.0, "step": 13602 }, { "epoch": 1.730441419666709, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.904178619384766, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8538268804550171, "num_tokens": 518838485.0, "step": 13603 }, { "epoch": 1.7305686299452996, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.561607360839844, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8527411222457886, "num_tokens": 518885936.0, "step": 13604 }, { "epoch": 1.7306958402238901, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.49205017089844, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8469021916389465, "num_tokens": 518925684.0, "step": 13605 }, { "epoch": 1.7308230505024806, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.60385513305664, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8609224557876587, "num_tokens": 518966438.0, "step": 13606 }, { "epoch": 1.730950260781071, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.79873275756836, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8627341985702515, "num_tokens": 519013576.0, "step": 13607 }, { "epoch": 1.7310774710596615, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.943599700927734, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8561556935310364, "num_tokens": 519045342.0, "step": 13608 }, { "epoch": 1.731204681338252, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.82181167602539, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8453027009963989, "num_tokens": 519088566.0, "step": 13609 }, { "epoch": 1.7313318916168425, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.52373123168945, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8623509407043457, "num_tokens": 519126516.0, "step": 13610 }, { "epoch": 1.731459101895433, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.48841857910156, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8683683276176453, "num_tokens": 519165431.0, "step": 13611 }, { "epoch": 1.7315863121740236, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.546417236328125, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8748194575309753, "num_tokens": 519200991.0, "step": 13612 }, { "epoch": 1.7317135224526141, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.862064361572266, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8666179180145264, "num_tokens": 519245988.0, "step": 13613 }, { "epoch": 1.7318407327312046, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.980926513671875, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8642492294311523, "num_tokens": 519282595.0, "step": 13614 }, { "epoch": 1.7319679430097952, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.87773895263672, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8537521362304688, "num_tokens": 519319061.0, "step": 13615 }, { "epoch": 1.7320951532883857, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.691402435302734, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8562742471694946, "num_tokens": 519358617.0, "step": 13616 }, { "epoch": 1.7322223635669762, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.30808639526367, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8586747646331787, "num_tokens": 519392461.0, "step": 13617 }, { "epoch": 1.7323495738455668, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.59336471557617, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8672200441360474, "num_tokens": 519432893.0, "step": 13618 }, { "epoch": 1.7324767841241573, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.95369338989258, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.860692024230957, "num_tokens": 519473445.0, "step": 13619 }, { "epoch": 1.7326039944027478, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 44.027957916259766, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8602482080459595, "num_tokens": 519518883.0, "step": 13620 }, { "epoch": 1.7327312046813383, "ewc_loss": 0.1474609375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000125885009765625, "grad_norm": 42.83830642700195, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8638240098953247, "num_tokens": 519550123.0, "step": 13621 }, { "epoch": 1.7328584149599289, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 44.08574676513672, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8501905202865601, "num_tokens": 519592538.0, "step": 13622 }, { "epoch": 1.7329856252385194, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 42.94550323486328, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8594523668289185, "num_tokens": 519633466.0, "step": 13623 }, { "epoch": 1.73311283551711, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.78798294067383, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8554109334945679, "num_tokens": 519672569.0, "step": 13624 }, { "epoch": 1.7332400457957005, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.17001724243164, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8658468127250671, "num_tokens": 519707383.0, "step": 13625 }, { "epoch": 1.7333672560742908, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.88179397583008, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8417726755142212, "num_tokens": 519748187.0, "step": 13626 }, { "epoch": 1.7334944663528813, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.25154495239258, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8584197759628296, "num_tokens": 519790215.0, "step": 13627 }, { "epoch": 1.7336216766314718, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.593650817871094, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8512319922447205, "num_tokens": 519827278.0, "step": 13628 }, { "epoch": 1.7337488869100623, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.675567626953125, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8666829466819763, "num_tokens": 519860216.0, "step": 13629 }, { "epoch": 1.7338760971886529, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.118682861328125, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8581792116165161, "num_tokens": 519894442.0, "step": 13630 }, { "epoch": 1.7340033074672434, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.46623229980469, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8591243028640747, "num_tokens": 519932524.0, "step": 13631 }, { "epoch": 1.7341305177458337, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.4727668762207, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8593273758888245, "num_tokens": 519974366.0, "step": 13632 }, { "epoch": 1.7342577280244242, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.03901290893555, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8695206642150879, "num_tokens": 520013983.0, "step": 13633 }, { "epoch": 1.7343849383030148, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.06010055541992, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8631547689437866, "num_tokens": 520047671.0, "step": 13634 }, { "epoch": 1.7345121485816053, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.5201301574707, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.855614423751831, "num_tokens": 520086574.0, "step": 13635 }, { "epoch": 1.7346393588601958, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.0037727355957, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8452770113945007, "num_tokens": 520130613.0, "step": 13636 }, { "epoch": 1.7347665691387864, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.3244514465332, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8767377734184265, "num_tokens": 520163763.0, "step": 13637 }, { "epoch": 1.7348937794173769, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.51005935668945, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8572448492050171, "num_tokens": 520198663.0, "step": 13638 }, { "epoch": 1.7350209896959674, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.71980667114258, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8624471426010132, "num_tokens": 520231737.0, "step": 13639 }, { "epoch": 1.735148199974558, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.06547546386719, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8655180335044861, "num_tokens": 520275664.0, "step": 13640 }, { "epoch": 1.7352754102531485, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.424339294433594, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.848101019859314, "num_tokens": 520316100.0, "step": 13641 }, { "epoch": 1.735402620531739, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.973793029785156, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8561011552810669, "num_tokens": 520348167.0, "step": 13642 }, { "epoch": 1.7355298308103295, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.49482727050781, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8467170000076294, "num_tokens": 520385234.0, "step": 13643 }, { "epoch": 1.73565704108892, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.877559661865234, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8438261151313782, "num_tokens": 520423051.0, "step": 13644 }, { "epoch": 1.7357842513675106, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.749549865722656, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8504631519317627, "num_tokens": 520467202.0, "step": 13645 }, { "epoch": 1.735911461646101, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.911163330078125, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8590152859687805, "num_tokens": 520501434.0, "step": 13646 }, { "epoch": 1.7360386719246916, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.81688690185547, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8551137447357178, "num_tokens": 520542757.0, "step": 13647 }, { "epoch": 1.7361658822032822, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.107181549072266, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.850593090057373, "num_tokens": 520585216.0, "step": 13648 }, { "epoch": 1.7362930924818727, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.17830276489258, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8598036766052246, "num_tokens": 520620066.0, "step": 13649 }, { "epoch": 1.736420302760463, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.656715393066406, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8669598698616028, "num_tokens": 520660917.0, "step": 13650 }, { "epoch": 1.7365475130390535, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.75432586669922, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8500563502311707, "num_tokens": 520704255.0, "step": 13651 }, { "epoch": 1.736674723317644, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.956050872802734, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8450021743774414, "num_tokens": 520743036.0, "step": 13652 }, { "epoch": 1.7368019335962346, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.48217010498047, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8529435396194458, "num_tokens": 520778325.0, "step": 13653 }, { "epoch": 1.736929143874825, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 44.30016326904297, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8725321292877197, "num_tokens": 520815226.0, "step": 13654 }, { "epoch": 1.7370563541534156, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.12800216674805, "learning_rate": 1e-06, "loss": 0.6372, "mean_token_accuracy": 0.8395167589187622, "num_tokens": 520856606.0, "step": 13655 }, { "epoch": 1.737183564432006, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.43047332763672, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.849614143371582, "num_tokens": 520893493.0, "step": 13656 }, { "epoch": 1.7373107747105965, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.035701751708984, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8662736415863037, "num_tokens": 520930391.0, "step": 13657 }, { "epoch": 1.737437984989187, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.56233215332031, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.859594464302063, "num_tokens": 520970512.0, "step": 13658 }, { "epoch": 1.7375651952677775, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.46297836303711, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8508614301681519, "num_tokens": 521011787.0, "step": 13659 }, { "epoch": 1.737692405546368, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.51110076904297, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8612691164016724, "num_tokens": 521053494.0, "step": 13660 }, { "epoch": 1.7378196158249586, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.42616653442383, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8469390869140625, "num_tokens": 521089291.0, "step": 13661 }, { "epoch": 1.7379468261035491, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.15983581542969, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8706007599830627, "num_tokens": 521128910.0, "step": 13662 }, { "epoch": 1.7380740363821396, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.61025619506836, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8549834489822388, "num_tokens": 521168175.0, "step": 13663 }, { "epoch": 1.7382012466607302, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.97757339477539, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8560041189193726, "num_tokens": 521208020.0, "step": 13664 }, { "epoch": 1.7383284569393207, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.690643310546875, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8706265091896057, "num_tokens": 521249828.0, "step": 13665 }, { "epoch": 1.7384556672179112, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.97037124633789, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8616811037063599, "num_tokens": 521289940.0, "step": 13666 }, { "epoch": 1.7385828774965018, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.507965087890625, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8776690363883972, "num_tokens": 521327426.0, "step": 13667 }, { "epoch": 1.7387100877750923, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.880699157714844, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.849838376045227, "num_tokens": 521372112.0, "step": 13668 }, { "epoch": 1.7388372980536828, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.29307556152344, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8584659099578857, "num_tokens": 521410842.0, "step": 13669 }, { "epoch": 1.7389645083322733, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.31843185424805, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.869523286819458, "num_tokens": 521446854.0, "step": 13670 }, { "epoch": 1.7390917186108639, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.608062744140625, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8489375114440918, "num_tokens": 521486284.0, "step": 13671 }, { "epoch": 1.7392189288894544, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.15925598144531, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8452056050300598, "num_tokens": 521522716.0, "step": 13672 }, { "epoch": 1.739346139168045, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.2603759765625, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8531458377838135, "num_tokens": 521551549.0, "step": 13673 }, { "epoch": 1.7394733494466355, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.5587043762207, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8621779680252075, "num_tokens": 521589552.0, "step": 13674 }, { "epoch": 1.7396005597252258, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.393272399902344, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.871096670627594, "num_tokens": 521627906.0, "step": 13675 }, { "epoch": 1.7397277700038163, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.52530288696289, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8430581092834473, "num_tokens": 521666615.0, "step": 13676 }, { "epoch": 1.7398549802824068, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.20216369628906, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.856853723526001, "num_tokens": 521707742.0, "step": 13677 }, { "epoch": 1.7399821905609973, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.73517608642578, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8478283286094666, "num_tokens": 521746848.0, "step": 13678 }, { "epoch": 1.7401094008395879, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.01027297973633, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8602113127708435, "num_tokens": 521782917.0, "step": 13679 }, { "epoch": 1.7402366111181784, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.820533752441406, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8564423322677612, "num_tokens": 521825609.0, "step": 13680 }, { "epoch": 1.7403638213967687, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.15326690673828, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8722493052482605, "num_tokens": 521862277.0, "step": 13681 }, { "epoch": 1.7404910316753592, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.840946197509766, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8783491849899292, "num_tokens": 521894951.0, "step": 13682 }, { "epoch": 1.7406182419539498, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.035430908203125, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8586239814758301, "num_tokens": 521932158.0, "step": 13683 }, { "epoch": 1.7407454522325403, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 44.00890350341797, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8663063049316406, "num_tokens": 521971050.0, "step": 13684 }, { "epoch": 1.7408726625111308, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.0337028503418, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8706664443016052, "num_tokens": 522007617.0, "step": 13685 }, { "epoch": 1.7409998727897213, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.935428619384766, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8476545214653015, "num_tokens": 522053484.0, "step": 13686 }, { "epoch": 1.7411270830683119, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.400569915771484, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8551966547966003, "num_tokens": 522088047.0, "step": 13687 }, { "epoch": 1.7412542933469024, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.787960052490234, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8589398860931396, "num_tokens": 522129865.0, "step": 13688 }, { "epoch": 1.741381503625493, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.62525939941406, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8549649715423584, "num_tokens": 522170154.0, "step": 13689 }, { "epoch": 1.7415087139040835, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.691959381103516, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8834027051925659, "num_tokens": 522204586.0, "step": 13690 }, { "epoch": 1.741635924182674, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.246402740478516, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8635098934173584, "num_tokens": 522241942.0, "step": 13691 }, { "epoch": 1.7417631344612645, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.67947006225586, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8558917045593262, "num_tokens": 522275800.0, "step": 13692 }, { "epoch": 1.741890344739855, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.2602424621582, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8537654876708984, "num_tokens": 522317024.0, "step": 13693 }, { "epoch": 1.7420175550184456, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.59629821777344, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8699610233306885, "num_tokens": 522358510.0, "step": 13694 }, { "epoch": 1.742144765297036, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.174072265625, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8506868481636047, "num_tokens": 522395960.0, "step": 13695 }, { "epoch": 1.7422719755756266, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.707496643066406, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8676429390907288, "num_tokens": 522428807.0, "step": 13696 }, { "epoch": 1.7423991858542172, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.928531646728516, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8596454858779907, "num_tokens": 522464472.0, "step": 13697 }, { "epoch": 1.7425263961328077, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 44.005496978759766, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8734304904937744, "num_tokens": 522509522.0, "step": 13698 }, { "epoch": 1.742653606411398, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.20159912109375, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8450726270675659, "num_tokens": 522545581.0, "step": 13699 }, { "epoch": 1.7427808166899885, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.5742301940918, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8618946075439453, "num_tokens": 522582072.0, "step": 13700 }, { "epoch": 1.742908026968579, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.18754196166992, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8625650405883789, "num_tokens": 522618183.0, "step": 13701 }, { "epoch": 1.7430352372471696, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.53696060180664, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8485949635505676, "num_tokens": 522660578.0, "step": 13702 }, { "epoch": 1.74316244752576, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.98817443847656, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8715907335281372, "num_tokens": 522701934.0, "step": 13703 }, { "epoch": 1.7432896578043506, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.383113861083984, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8784791231155396, "num_tokens": 522737404.0, "step": 13704 }, { "epoch": 1.743416868082941, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.73176956176758, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8549593687057495, "num_tokens": 522774188.0, "step": 13705 }, { "epoch": 1.7435440783615315, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.54335403442383, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8691510558128357, "num_tokens": 522810467.0, "step": 13706 }, { "epoch": 1.743671288640122, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.32612991333008, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8482781648635864, "num_tokens": 522845246.0, "step": 13707 }, { "epoch": 1.7437984989187125, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.51039505004883, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8693393468856812, "num_tokens": 522877150.0, "step": 13708 }, { "epoch": 1.743925709197303, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.4172477722168, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8731134533882141, "num_tokens": 522913161.0, "step": 13709 }, { "epoch": 1.7440529194758936, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.1544075012207, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8450636863708496, "num_tokens": 522953464.0, "step": 13710 }, { "epoch": 1.744180129754484, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.62587356567383, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8632623553276062, "num_tokens": 522996874.0, "step": 13711 }, { "epoch": 1.7443073400330746, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.98269271850586, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8610665798187256, "num_tokens": 523031349.0, "step": 13712 }, { "epoch": 1.7444345503116652, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.726497650146484, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8516771793365479, "num_tokens": 523068910.0, "step": 13713 }, { "epoch": 1.7445617605902557, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.9228630065918, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8737499117851257, "num_tokens": 523106179.0, "step": 13714 }, { "epoch": 1.7446889708688462, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.61203384399414, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8612363934516907, "num_tokens": 523140827.0, "step": 13715 }, { "epoch": 1.7448161811474368, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.94957733154297, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8734357357025146, "num_tokens": 523182880.0, "step": 13716 }, { "epoch": 1.7449433914260273, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.52451705932617, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8641991019248962, "num_tokens": 523213059.0, "step": 13717 }, { "epoch": 1.7450706017046178, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.94305419921875, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8580605387687683, "num_tokens": 523254398.0, "step": 13718 }, { "epoch": 1.7451978119832083, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.62558364868164, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8724758625030518, "num_tokens": 523293325.0, "step": 13719 }, { "epoch": 1.7453250222617989, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.89828109741211, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8500990867614746, "num_tokens": 523334402.0, "step": 13720 }, { "epoch": 1.7454522325403894, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.867984771728516, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8645386695861816, "num_tokens": 523376238.0, "step": 13721 }, { "epoch": 1.74557944281898, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.56773376464844, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.880121111869812, "num_tokens": 523417074.0, "step": 13722 }, { "epoch": 1.7457066530975704, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.89564514160156, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8664151430130005, "num_tokens": 523457984.0, "step": 13723 }, { "epoch": 1.7458338633761608, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.83158874511719, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8566532135009766, "num_tokens": 523490071.0, "step": 13724 }, { "epoch": 1.7459610736547513, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.30488967895508, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.854326605796814, "num_tokens": 523530105.0, "step": 13725 }, { "epoch": 1.7460882839333418, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.79087448120117, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.87441086769104, "num_tokens": 523576698.0, "step": 13726 }, { "epoch": 1.7462154942119323, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.747196197509766, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8614761233329773, "num_tokens": 523614387.0, "step": 13727 }, { "epoch": 1.7463427044905229, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.00372314453125, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8549435138702393, "num_tokens": 523651149.0, "step": 13728 }, { "epoch": 1.7464699147691134, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.38880157470703, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.872046709060669, "num_tokens": 523688448.0, "step": 13729 }, { "epoch": 1.7465971250477037, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.5198860168457, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.8640320301055908, "num_tokens": 523725805.0, "step": 13730 }, { "epoch": 1.7467243353262942, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.98439025878906, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8605764508247375, "num_tokens": 523763564.0, "step": 13731 }, { "epoch": 1.7468515456048848, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.243316650390625, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8556346893310547, "num_tokens": 523804422.0, "step": 13732 }, { "epoch": 1.7469787558834753, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.91801452636719, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8693416714668274, "num_tokens": 523845889.0, "step": 13733 }, { "epoch": 1.7471059661620658, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.55856704711914, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8719255924224854, "num_tokens": 523883623.0, "step": 13734 }, { "epoch": 1.7472331764406563, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.85695266723633, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8660556077957153, "num_tokens": 523917273.0, "step": 13735 }, { "epoch": 1.7473603867192469, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.49740982055664, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8575940132141113, "num_tokens": 523956149.0, "step": 13736 }, { "epoch": 1.7474875969978374, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.13233184814453, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8589035272598267, "num_tokens": 523997229.0, "step": 13737 }, { "epoch": 1.747614807276428, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.554683685302734, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8653315305709839, "num_tokens": 524032921.0, "step": 13738 }, { "epoch": 1.7477420175550185, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.22443771362305, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8625496625900269, "num_tokens": 524065016.0, "step": 13739 }, { "epoch": 1.747869227833609, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.564178466796875, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8485118746757507, "num_tokens": 524107001.0, "step": 13740 }, { "epoch": 1.7479964381121995, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.18648910522461, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8545412421226501, "num_tokens": 524149283.0, "step": 13741 }, { "epoch": 1.74812364839079, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.1283073425293, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8609806895256042, "num_tokens": 524187240.0, "step": 13742 }, { "epoch": 1.7482508586693806, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.203041076660156, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8691703081130981, "num_tokens": 524216793.0, "step": 13743 }, { "epoch": 1.748378068947971, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.97303771972656, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8657786846160889, "num_tokens": 524257629.0, "step": 13744 }, { "epoch": 1.7485052792265616, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.336456298828125, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8722853660583496, "num_tokens": 524291090.0, "step": 13745 }, { "epoch": 1.7486324895051522, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.930355072021484, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.874261736869812, "num_tokens": 524324882.0, "step": 13746 }, { "epoch": 1.7487596997837427, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.38343048095703, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8429746031761169, "num_tokens": 524359422.0, "step": 13747 }, { "epoch": 1.748886910062333, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.1715087890625, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.865503191947937, "num_tokens": 524398383.0, "step": 13748 }, { "epoch": 1.7490141203409235, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.693138122558594, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8582650423049927, "num_tokens": 524437984.0, "step": 13749 }, { "epoch": 1.749141330619514, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.8574104309082, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8676115274429321, "num_tokens": 524471324.0, "step": 13750 }, { "epoch": 1.7492685408981046, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.030330657958984, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8516228199005127, "num_tokens": 524511471.0, "step": 13751 }, { "epoch": 1.749395751176695, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.11434555053711, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.864545464515686, "num_tokens": 524541743.0, "step": 13752 }, { "epoch": 1.7495229614552856, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.978355407714844, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8607664108276367, "num_tokens": 524581470.0, "step": 13753 }, { "epoch": 1.749650171733876, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.59735107421875, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8625799417495728, "num_tokens": 524612722.0, "step": 13754 }, { "epoch": 1.7497773820124665, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.606685638427734, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8694331049919128, "num_tokens": 524647640.0, "step": 13755 }, { "epoch": 1.749904592291057, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.8511848449707, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8542778491973877, "num_tokens": 524682366.0, "step": 13756 }, { "epoch": 1.7500318025696475, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.334129333496094, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8611778020858765, "num_tokens": 524724242.0, "step": 13757 }, { "epoch": 1.750159012848238, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.77876281738281, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8462685346603394, "num_tokens": 524757373.0, "step": 13758 }, { "epoch": 1.7502862231268286, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.330108642578125, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8500198125839233, "num_tokens": 524797704.0, "step": 13759 }, { "epoch": 1.750413433405419, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.602115631103516, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8633372783660889, "num_tokens": 524836710.0, "step": 13760 }, { "epoch": 1.7505406436840096, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.316932678222656, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8586055040359497, "num_tokens": 524870224.0, "step": 13761 }, { "epoch": 1.7506678539626002, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.86088180541992, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.863854169845581, "num_tokens": 524905345.0, "step": 13762 }, { "epoch": 1.7507950642411907, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.288818359375, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8442668914794922, "num_tokens": 524943893.0, "step": 13763 }, { "epoch": 1.7509222745197812, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.560035705566406, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8580374717712402, "num_tokens": 524982924.0, "step": 13764 }, { "epoch": 1.7510494847983717, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.74665069580078, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8587942719459534, "num_tokens": 525016749.0, "step": 13765 }, { "epoch": 1.7511766950769623, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.39457321166992, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.862870454788208, "num_tokens": 525049178.0, "step": 13766 }, { "epoch": 1.7513039053555528, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.755306243896484, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8456576466560364, "num_tokens": 525083456.0, "step": 13767 }, { "epoch": 1.7514311156341433, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.12948226928711, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8645989894866943, "num_tokens": 525120159.0, "step": 13768 }, { "epoch": 1.7515583259127339, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.79258346557617, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8649923801422119, "num_tokens": 525159174.0, "step": 13769 }, { "epoch": 1.7516855361913244, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.76956558227539, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8588172197341919, "num_tokens": 525198373.0, "step": 13770 }, { "epoch": 1.751812746469915, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.451377868652344, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8656806945800781, "num_tokens": 525240618.0, "step": 13771 }, { "epoch": 1.7519399567485054, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.85342025756836, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8756609559059143, "num_tokens": 525274899.0, "step": 13772 }, { "epoch": 1.7520671670270958, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.43616485595703, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8404402732849121, "num_tokens": 525310144.0, "step": 13773 }, { "epoch": 1.7521943773056863, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.41613006591797, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8690391778945923, "num_tokens": 525348565.0, "step": 13774 }, { "epoch": 1.7523215875842768, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.500858306884766, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.867013156414032, "num_tokens": 525383709.0, "step": 13775 }, { "epoch": 1.7524487978628673, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.61278533935547, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8541621565818787, "num_tokens": 525423983.0, "step": 13776 }, { "epoch": 1.7525760081414579, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.58400344848633, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8674860000610352, "num_tokens": 525457357.0, "step": 13777 }, { "epoch": 1.7527032184200484, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.928524017333984, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8590950965881348, "num_tokens": 525497021.0, "step": 13778 }, { "epoch": 1.7528304286986387, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.78435516357422, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.876020073890686, "num_tokens": 525533592.0, "step": 13779 }, { "epoch": 1.7529576389772292, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.44497299194336, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8529603481292725, "num_tokens": 525572747.0, "step": 13780 }, { "epoch": 1.7530848492558198, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.944847106933594, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.866058349609375, "num_tokens": 525613578.0, "step": 13781 }, { "epoch": 1.7532120595344103, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.664432525634766, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8679405450820923, "num_tokens": 525655047.0, "step": 13782 }, { "epoch": 1.7533392698130008, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.48503494262695, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8568796515464783, "num_tokens": 525689561.0, "step": 13783 }, { "epoch": 1.7534664800915913, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.682952880859375, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8578240275382996, "num_tokens": 525726946.0, "step": 13784 }, { "epoch": 1.7535936903701819, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.46525955200195, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8533143997192383, "num_tokens": 525762089.0, "step": 13785 }, { "epoch": 1.7537209006487724, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.83175277709961, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8652970194816589, "num_tokens": 525796081.0, "step": 13786 }, { "epoch": 1.753848110927363, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.38445281982422, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8486359119415283, "num_tokens": 525829945.0, "step": 13787 }, { "epoch": 1.7539753212059535, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.79633331298828, "learning_rate": 1e-06, "loss": 0.5332, "mean_token_accuracy": 0.869358241558075, "num_tokens": 525871289.0, "step": 13788 }, { "epoch": 1.754102531484544, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.82426834106445, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8622327446937561, "num_tokens": 525908455.0, "step": 13789 }, { "epoch": 1.7542297417631345, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.48435592651367, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8829766511917114, "num_tokens": 525943734.0, "step": 13790 }, { "epoch": 1.754356952041725, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.84908676147461, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.884395182132721, "num_tokens": 525988070.0, "step": 13791 }, { "epoch": 1.7544841623203156, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.0748176574707, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8768633008003235, "num_tokens": 526027940.0, "step": 13792 }, { "epoch": 1.754611372598906, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.81516647338867, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8715417385101318, "num_tokens": 526061357.0, "step": 13793 }, { "epoch": 1.7547385828774966, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.48774719238281, "learning_rate": 1e-06, "loss": 0.5465, "mean_token_accuracy": 0.8632832765579224, "num_tokens": 526102157.0, "step": 13794 }, { "epoch": 1.7548657931560872, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.53684997558594, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8528061509132385, "num_tokens": 526137241.0, "step": 13795 }, { "epoch": 1.7549930034346777, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.76551818847656, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8529303073883057, "num_tokens": 526169079.0, "step": 13796 }, { "epoch": 1.755120213713268, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.82749938964844, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.866195797920227, "num_tokens": 526212212.0, "step": 13797 }, { "epoch": 1.7552474239918585, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.61054611206055, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8681356310844421, "num_tokens": 526250380.0, "step": 13798 }, { "epoch": 1.755374634270449, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.513641357421875, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8684517741203308, "num_tokens": 526292525.0, "step": 13799 }, { "epoch": 1.7555018445490396, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.96015930175781, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8621380925178528, "num_tokens": 526330427.0, "step": 13800 }, { "epoch": 1.75562905482763, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.37057876586914, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8615043759346008, "num_tokens": 526362954.0, "step": 13801 }, { "epoch": 1.7557562651062206, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.6618537902832, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8543797731399536, "num_tokens": 526394310.0, "step": 13802 }, { "epoch": 1.755883475384811, "ewc_loss": 0.1484375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012683868408203125, "grad_norm": 43.33865737915039, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8416182994842529, "num_tokens": 526438507.0, "step": 13803 }, { "epoch": 1.7560106856634015, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.745384216308594, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8580021858215332, "num_tokens": 526475124.0, "step": 13804 }, { "epoch": 1.756137895941992, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.355525970458984, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.869911789894104, "num_tokens": 526514936.0, "step": 13805 }, { "epoch": 1.7562651062205825, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.724788665771484, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8542050123214722, "num_tokens": 526548649.0, "step": 13806 }, { "epoch": 1.756392316499173, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.59292221069336, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8459965586662292, "num_tokens": 526589028.0, "step": 13807 }, { "epoch": 1.7565195267777636, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.374114990234375, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8622356057167053, "num_tokens": 526626327.0, "step": 13808 }, { "epoch": 1.756646737056354, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.712379455566406, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.857554018497467, "num_tokens": 526672264.0, "step": 13809 }, { "epoch": 1.7567739473349446, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.369667053222656, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8553658127784729, "num_tokens": 526709979.0, "step": 13810 }, { "epoch": 1.7569011576135352, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.416038513183594, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.874167799949646, "num_tokens": 526745002.0, "step": 13811 }, { "epoch": 1.7570283678921257, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.77837371826172, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8770320415496826, "num_tokens": 526785684.0, "step": 13812 }, { "epoch": 1.7571555781707162, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.30482864379883, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.880791187286377, "num_tokens": 526820246.0, "step": 13813 }, { "epoch": 1.7572827884493067, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.79966354370117, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8659358024597168, "num_tokens": 526854815.0, "step": 13814 }, { "epoch": 1.7574099987278973, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.75855255126953, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8670942783355713, "num_tokens": 526896336.0, "step": 13815 }, { "epoch": 1.7575372090064878, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.072689056396484, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8723609447479248, "num_tokens": 526939586.0, "step": 13816 }, { "epoch": 1.7576644192850783, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.65402603149414, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8665659427642822, "num_tokens": 526975027.0, "step": 13817 }, { "epoch": 1.7577916295636689, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.77033996582031, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.863893449306488, "num_tokens": 527017820.0, "step": 13818 }, { "epoch": 1.7579188398422594, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.21713638305664, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8725935220718384, "num_tokens": 527051647.0, "step": 13819 }, { "epoch": 1.75804605012085, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.310245513916016, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8667540550231934, "num_tokens": 527089590.0, "step": 13820 }, { "epoch": 1.7581732603994404, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.26958465576172, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8602021932601929, "num_tokens": 527126696.0, "step": 13821 }, { "epoch": 1.7583004706780307, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.209625244140625, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8526759147644043, "num_tokens": 527164823.0, "step": 13822 }, { "epoch": 1.7584276809566213, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.038818359375, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.8438605070114136, "num_tokens": 527208032.0, "step": 13823 }, { "epoch": 1.7585548912352118, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.56417465209961, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.863216757774353, "num_tokens": 527242773.0, "step": 13824 }, { "epoch": 1.7586821015138023, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.0480842590332, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8640267848968506, "num_tokens": 527277968.0, "step": 13825 }, { "epoch": 1.7588093117923929, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.5037727355957, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8738657236099243, "num_tokens": 527311077.0, "step": 13826 }, { "epoch": 1.7589365220709834, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.31599426269531, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8684073686599731, "num_tokens": 527340543.0, "step": 13827 }, { "epoch": 1.7590637323495737, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.44173049926758, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8593990206718445, "num_tokens": 527379596.0, "step": 13828 }, { "epoch": 1.7591909426281642, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.38723373413086, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8610315918922424, "num_tokens": 527417873.0, "step": 13829 }, { "epoch": 1.7593181529067548, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.27297592163086, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8642174005508423, "num_tokens": 527458607.0, "step": 13830 }, { "epoch": 1.7594453631853453, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.56605911254883, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8535938262939453, "num_tokens": 527493299.0, "step": 13831 }, { "epoch": 1.7595725734639358, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.54050064086914, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8499648571014404, "num_tokens": 527536065.0, "step": 13832 }, { "epoch": 1.7596997837425263, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.031333923339844, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8461756706237793, "num_tokens": 527576477.0, "step": 13833 }, { "epoch": 1.7598269940211169, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.38041687011719, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8592841625213623, "num_tokens": 527611777.0, "step": 13834 }, { "epoch": 1.7599542042997074, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.321746826171875, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8784893751144409, "num_tokens": 527650236.0, "step": 13835 }, { "epoch": 1.760081414578298, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.57733154296875, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8662511706352234, "num_tokens": 527685486.0, "step": 13836 }, { "epoch": 1.7602086248568884, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.53703689575195, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8488532304763794, "num_tokens": 527725386.0, "step": 13837 }, { "epoch": 1.760335835135479, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.406959533691406, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8498872518539429, "num_tokens": 527756213.0, "step": 13838 }, { "epoch": 1.7604630454140695, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.247928619384766, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8664000034332275, "num_tokens": 527793884.0, "step": 13839 }, { "epoch": 1.76059025569266, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.55480194091797, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8609848618507385, "num_tokens": 527826428.0, "step": 13840 }, { "epoch": 1.7607174659712506, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.55704879760742, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8543402552604675, "num_tokens": 527866691.0, "step": 13841 }, { "epoch": 1.760844676249841, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.62540054321289, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8712060451507568, "num_tokens": 527909482.0, "step": 13842 }, { "epoch": 1.7609718865284316, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.559749603271484, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8496271371841431, "num_tokens": 527947818.0, "step": 13843 }, { "epoch": 1.7610990968070221, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.91229248046875, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.864553689956665, "num_tokens": 527986760.0, "step": 13844 }, { "epoch": 1.7612263070856127, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.181640625, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.854134738445282, "num_tokens": 528024742.0, "step": 13845 }, { "epoch": 1.761353517364203, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.9563102722168, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8458374738693237, "num_tokens": 528060781.0, "step": 13846 }, { "epoch": 1.7614807276427935, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.8115119934082, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8641575574874878, "num_tokens": 528099051.0, "step": 13847 }, { "epoch": 1.761607937921384, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.87321472167969, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8545336723327637, "num_tokens": 528138325.0, "step": 13848 }, { "epoch": 1.7617351481999746, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.231136322021484, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8524128198623657, "num_tokens": 528177997.0, "step": 13849 }, { "epoch": 1.761862358478565, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.56297302246094, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8588724136352539, "num_tokens": 528216360.0, "step": 13850 }, { "epoch": 1.7619895687571556, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.56780242919922, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8766289353370667, "num_tokens": 528251100.0, "step": 13851 }, { "epoch": 1.762116779035746, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.763031005859375, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8657820224761963, "num_tokens": 528286298.0, "step": 13852 }, { "epoch": 1.7622439893143365, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.37508010864258, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.878533124923706, "num_tokens": 528322970.0, "step": 13853 }, { "epoch": 1.762371199592927, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.91694641113281, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8790146708488464, "num_tokens": 528355191.0, "step": 13854 }, { "epoch": 1.7624984098715175, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.4100456237793, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.862402081489563, "num_tokens": 528388975.0, "step": 13855 }, { "epoch": 1.762625620150108, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.51980972290039, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8655615448951721, "num_tokens": 528432139.0, "step": 13856 }, { "epoch": 1.7627528304286986, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.773597717285156, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8710581064224243, "num_tokens": 528468315.0, "step": 13857 }, { "epoch": 1.762880040707289, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.49924850463867, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8509387969970703, "num_tokens": 528510368.0, "step": 13858 }, { "epoch": 1.7630072509858796, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.71036148071289, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.86399245262146, "num_tokens": 528545637.0, "step": 13859 }, { "epoch": 1.7631344612644702, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.345333099365234, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8653106689453125, "num_tokens": 528586267.0, "step": 13860 }, { "epoch": 1.7632616715430607, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.84957504272461, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8663808703422546, "num_tokens": 528629640.0, "step": 13861 }, { "epoch": 1.7633888818216512, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.97068405151367, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8472418785095215, "num_tokens": 528665373.0, "step": 13862 }, { "epoch": 1.7635160921002417, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.905635833740234, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.877345621585846, "num_tokens": 528703426.0, "step": 13863 }, { "epoch": 1.7636433023788323, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.96912384033203, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.843389630317688, "num_tokens": 528744886.0, "step": 13864 }, { "epoch": 1.7637705126574228, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.61031723022461, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8744153380393982, "num_tokens": 528779773.0, "step": 13865 }, { "epoch": 1.7638977229360133, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.01298141479492, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8653638958930969, "num_tokens": 528821878.0, "step": 13866 }, { "epoch": 1.7640249332146039, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.63439178466797, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8620706796646118, "num_tokens": 528858056.0, "step": 13867 }, { "epoch": 1.7641521434931944, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.27511978149414, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8581774234771729, "num_tokens": 528903116.0, "step": 13868 }, { "epoch": 1.764279353771785, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.36983108520508, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8552541136741638, "num_tokens": 528941561.0, "step": 13869 }, { "epoch": 1.7644065640503754, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.7020263671875, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8786749839782715, "num_tokens": 528976215.0, "step": 13870 }, { "epoch": 1.7645337743289657, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.110836029052734, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.8688178062438965, "num_tokens": 529019280.0, "step": 13871 }, { "epoch": 1.7646609846075563, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.699771881103516, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8660702705383301, "num_tokens": 529060733.0, "step": 13872 }, { "epoch": 1.7647881948861468, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.0123405456543, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8638087511062622, "num_tokens": 529094948.0, "step": 13873 }, { "epoch": 1.7649154051647373, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.48479080200195, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8748725652694702, "num_tokens": 529131374.0, "step": 13874 }, { "epoch": 1.7650426154433279, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.11012649536133, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8577010631561279, "num_tokens": 529169133.0, "step": 13875 }, { "epoch": 1.7651698257219184, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.49504089355469, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8609562516212463, "num_tokens": 529206467.0, "step": 13876 }, { "epoch": 1.7652970360005087, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 42.75299072265625, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8525906205177307, "num_tokens": 529244891.0, "step": 13877 }, { "epoch": 1.7654242462790992, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.013301849365234, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8625577688217163, "num_tokens": 529277384.0, "step": 13878 }, { "epoch": 1.7655514565576897, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.79935073852539, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8658419251441956, "num_tokens": 529322530.0, "step": 13879 }, { "epoch": 1.7656786668362803, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.060150146484375, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8574289083480835, "num_tokens": 529356345.0, "step": 13880 }, { "epoch": 1.7658058771148708, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.00307083129883, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8638962507247925, "num_tokens": 529391877.0, "step": 13881 }, { "epoch": 1.7659330873934613, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.97347640991211, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8689026832580566, "num_tokens": 529425460.0, "step": 13882 }, { "epoch": 1.7660602976720519, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.36015319824219, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8527387380599976, "num_tokens": 529459660.0, "step": 13883 }, { "epoch": 1.7661875079506424, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.61402130126953, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8548465371131897, "num_tokens": 529501839.0, "step": 13884 }, { "epoch": 1.766314718229233, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.634559631347656, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8602384924888611, "num_tokens": 529540828.0, "step": 13885 }, { "epoch": 1.7664419285078234, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.60093307495117, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8611922264099121, "num_tokens": 529581578.0, "step": 13886 }, { "epoch": 1.766569138786414, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.27958297729492, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8617722988128662, "num_tokens": 529624363.0, "step": 13887 }, { "epoch": 1.7666963490650045, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.84186553955078, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8693066835403442, "num_tokens": 529665286.0, "step": 13888 }, { "epoch": 1.766823559343595, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.82603454589844, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.857581377029419, "num_tokens": 529704150.0, "step": 13889 }, { "epoch": 1.7669507696221856, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.40269088745117, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8640657067298889, "num_tokens": 529742110.0, "step": 13890 }, { "epoch": 1.767077979900776, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 42.76434326171875, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8607816100120544, "num_tokens": 529776158.0, "step": 13891 }, { "epoch": 1.7672051901793666, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.822086334228516, "learning_rate": 1e-06, "loss": 0.6296, "mean_token_accuracy": 0.8394006490707397, "num_tokens": 529814002.0, "step": 13892 }, { "epoch": 1.7673324004579571, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.13776397705078, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8645355701446533, "num_tokens": 529846696.0, "step": 13893 }, { "epoch": 1.7674596107365477, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.986698150634766, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8586288690567017, "num_tokens": 529881557.0, "step": 13894 }, { "epoch": 1.767586821015138, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.14714050292969, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.848115086555481, "num_tokens": 529924337.0, "step": 13895 }, { "epoch": 1.7677140312937285, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.669368743896484, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8590106964111328, "num_tokens": 529967195.0, "step": 13896 }, { "epoch": 1.767841241572319, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.262596130371094, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8431573510169983, "num_tokens": 530003454.0, "step": 13897 }, { "epoch": 1.7679684518509096, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.92694091796875, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8352522850036621, "num_tokens": 530042577.0, "step": 13898 }, { "epoch": 1.7680956621295, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.22423553466797, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.84522545337677, "num_tokens": 530084903.0, "step": 13899 }, { "epoch": 1.7682228724080906, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.742923736572266, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8578846454620361, "num_tokens": 530117598.0, "step": 13900 }, { "epoch": 1.768350082686681, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.399356842041016, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8651494979858398, "num_tokens": 530151891.0, "step": 13901 }, { "epoch": 1.7684772929652715, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.838382720947266, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8526332378387451, "num_tokens": 530191263.0, "step": 13902 }, { "epoch": 1.768604503243862, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.300743103027344, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8571202158927917, "num_tokens": 530231156.0, "step": 13903 }, { "epoch": 1.7687317135224525, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.53526306152344, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8741281032562256, "num_tokens": 530270089.0, "step": 13904 }, { "epoch": 1.768858923801043, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.537593841552734, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8642604351043701, "num_tokens": 530309808.0, "step": 13905 }, { "epoch": 1.7689861340796336, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.55618667602539, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8485606908798218, "num_tokens": 530357723.0, "step": 13906 }, { "epoch": 1.769113344358224, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.3426399230957, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8496132493019104, "num_tokens": 530389586.0, "step": 13907 }, { "epoch": 1.7692405546368146, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.51166915893555, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8516247868537903, "num_tokens": 530430356.0, "step": 13908 }, { "epoch": 1.7693677649154052, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.219581604003906, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8665884733200073, "num_tokens": 530472859.0, "step": 13909 }, { "epoch": 1.7694949751939957, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.633872985839844, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.848595142364502, "num_tokens": 530515422.0, "step": 13910 }, { "epoch": 1.7696221854725862, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.721500396728516, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.86025470495224, "num_tokens": 530553842.0, "step": 13911 }, { "epoch": 1.7697493957511767, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.392093658447266, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.859886109828949, "num_tokens": 530592593.0, "step": 13912 }, { "epoch": 1.7698766060297673, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.025901794433594, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8557636737823486, "num_tokens": 530632309.0, "step": 13913 }, { "epoch": 1.7700038163083578, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.561763763427734, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8558506369590759, "num_tokens": 530667415.0, "step": 13914 }, { "epoch": 1.7701310265869483, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.50535583496094, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8582985401153564, "num_tokens": 530705736.0, "step": 13915 }, { "epoch": 1.7702582368655388, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.49212646484375, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8695173263549805, "num_tokens": 530744931.0, "step": 13916 }, { "epoch": 1.7703854471441294, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.474178314208984, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8734636306762695, "num_tokens": 530784700.0, "step": 13917 }, { "epoch": 1.77051265742272, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.70218276977539, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8767518401145935, "num_tokens": 530821097.0, "step": 13918 }, { "epoch": 1.7706398677013102, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.6987419128418, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8552602529525757, "num_tokens": 530860444.0, "step": 13919 }, { "epoch": 1.7707670779799007, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.00360870361328, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8578276038169861, "num_tokens": 530907304.0, "step": 13920 }, { "epoch": 1.7708942882584913, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.89900588989258, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8519396781921387, "num_tokens": 530945692.0, "step": 13921 }, { "epoch": 1.7710214985370818, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.66819763183594, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8701334595680237, "num_tokens": 530986695.0, "step": 13922 }, { "epoch": 1.7711487088156723, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.979637145996094, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8720676898956299, "num_tokens": 531022656.0, "step": 13923 }, { "epoch": 1.7712759190942629, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.90589141845703, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8696566820144653, "num_tokens": 531056896.0, "step": 13924 }, { "epoch": 1.7714031293728534, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.30385208129883, "learning_rate": 1e-06, "loss": 0.6485, "mean_token_accuracy": 0.8347036838531494, "num_tokens": 531097286.0, "step": 13925 }, { "epoch": 1.7715303396514437, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.96272659301758, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.868316113948822, "num_tokens": 531136430.0, "step": 13926 }, { "epoch": 1.7716575499300342, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.2425537109375, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8517808318138123, "num_tokens": 531176698.0, "step": 13927 }, { "epoch": 1.7717847602086247, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.77885818481445, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8588537573814392, "num_tokens": 531214343.0, "step": 13928 }, { "epoch": 1.7719119704872153, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.12961196899414, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.872904896736145, "num_tokens": 531252325.0, "step": 13929 }, { "epoch": 1.7720391807658058, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.11779022216797, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8474578857421875, "num_tokens": 531290378.0, "step": 13930 }, { "epoch": 1.7721663910443963, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.083683013916016, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8495652675628662, "num_tokens": 531330406.0, "step": 13931 }, { "epoch": 1.7722936013229869, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.066375732421875, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8589885234832764, "num_tokens": 531367425.0, "step": 13932 }, { "epoch": 1.7724208116015774, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.16728973388672, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8806077241897583, "num_tokens": 531406465.0, "step": 13933 }, { "epoch": 1.772548021880168, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.92793273925781, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8486250638961792, "num_tokens": 531445356.0, "step": 13934 }, { "epoch": 1.7726752321587584, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.81319046020508, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8582357168197632, "num_tokens": 531484737.0, "step": 13935 }, { "epoch": 1.772802442437349, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.727237701416016, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8584409952163696, "num_tokens": 531523453.0, "step": 13936 }, { "epoch": 1.7729296527159395, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.08272171020508, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8680838942527771, "num_tokens": 531559627.0, "step": 13937 }, { "epoch": 1.77305686299453, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.11155319213867, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8653664588928223, "num_tokens": 531597677.0, "step": 13938 }, { "epoch": 1.7731840732731206, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.32398223876953, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8690118193626404, "num_tokens": 531642753.0, "step": 13939 }, { "epoch": 1.773311283551711, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.47819519042969, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8532693386077881, "num_tokens": 531688249.0, "step": 13940 }, { "epoch": 1.7734384938303016, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.7513427734375, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8757306337356567, "num_tokens": 531727994.0, "step": 13941 }, { "epoch": 1.7735657041088921, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.144718170166016, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8631253838539124, "num_tokens": 531764428.0, "step": 13942 }, { "epoch": 1.7736929143874827, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.87894821166992, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8564460277557373, "num_tokens": 531806604.0, "step": 13943 }, { "epoch": 1.773820124666073, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.26451110839844, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8783107399940491, "num_tokens": 531845504.0, "step": 13944 }, { "epoch": 1.7739473349446635, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.754127502441406, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8684718608856201, "num_tokens": 531880962.0, "step": 13945 }, { "epoch": 1.774074545223254, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.46012878417969, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8581047058105469, "num_tokens": 531918182.0, "step": 13946 }, { "epoch": 1.7742017555018446, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.994842529296875, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8623523712158203, "num_tokens": 531958340.0, "step": 13947 }, { "epoch": 1.774328965780435, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.42988586425781, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8484772443771362, "num_tokens": 531996951.0, "step": 13948 }, { "epoch": 1.7744561760590256, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.56724166870117, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8642644882202148, "num_tokens": 532036963.0, "step": 13949 }, { "epoch": 1.774583386337616, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.581058502197266, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8536268472671509, "num_tokens": 532070937.0, "step": 13950 }, { "epoch": 1.7747105966162064, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.738101959228516, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8464291095733643, "num_tokens": 532107866.0, "step": 13951 }, { "epoch": 1.774837806894797, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.98389434814453, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8540168404579163, "num_tokens": 532151485.0, "step": 13952 }, { "epoch": 1.7749650171733875, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.2808723449707, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.867671012878418, "num_tokens": 532191209.0, "step": 13953 }, { "epoch": 1.775092227451978, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.19967269897461, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8540804386138916, "num_tokens": 532232219.0, "step": 13954 }, { "epoch": 1.7752194377305686, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.44276428222656, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8501119017601013, "num_tokens": 532265330.0, "step": 13955 }, { "epoch": 1.775346648009159, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.773338317871094, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8571633696556091, "num_tokens": 532295964.0, "step": 13956 }, { "epoch": 1.7754738582877496, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.63002395629883, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8636813759803772, "num_tokens": 532333804.0, "step": 13957 }, { "epoch": 1.7756010685663401, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.81745529174805, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.865225613117218, "num_tokens": 532373088.0, "step": 13958 }, { "epoch": 1.7757282788449307, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.73276901245117, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8572173118591309, "num_tokens": 532409473.0, "step": 13959 }, { "epoch": 1.7758554891235212, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.54954528808594, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8730312585830688, "num_tokens": 532447883.0, "step": 13960 }, { "epoch": 1.7759826994021117, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.6721305847168, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8651866912841797, "num_tokens": 532485783.0, "step": 13961 }, { "epoch": 1.7761099096807023, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.52867126464844, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8664944171905518, "num_tokens": 532511082.0, "step": 13962 }, { "epoch": 1.7762371199592928, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.85057830810547, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.879091739654541, "num_tokens": 532555516.0, "step": 13963 }, { "epoch": 1.7763643302378833, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.565528869628906, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.845366358757019, "num_tokens": 532599200.0, "step": 13964 }, { "epoch": 1.7764915405164738, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.52200698852539, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8699589371681213, "num_tokens": 532636335.0, "step": 13965 }, { "epoch": 1.7766187507950644, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.42074203491211, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8728947043418884, "num_tokens": 532678337.0, "step": 13966 }, { "epoch": 1.776745961073655, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.785499572753906, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8570950627326965, "num_tokens": 532715051.0, "step": 13967 }, { "epoch": 1.7768731713522452, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.296417236328125, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8708274960517883, "num_tokens": 532750836.0, "step": 13968 }, { "epoch": 1.7770003816308357, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.444149017333984, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8667317628860474, "num_tokens": 532790331.0, "step": 13969 }, { "epoch": 1.7771275919094263, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.4001579284668, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.856410562992096, "num_tokens": 532830169.0, "step": 13970 }, { "epoch": 1.7772548021880168, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.8226432800293, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8550012111663818, "num_tokens": 532865957.0, "step": 13971 }, { "epoch": 1.7773820124666073, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.33347702026367, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.857099711894989, "num_tokens": 532906112.0, "step": 13972 }, { "epoch": 1.7775092227451978, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.650726318359375, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8683148622512817, "num_tokens": 532939676.0, "step": 13973 }, { "epoch": 1.7776364330237884, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.39690399169922, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8426499366760254, "num_tokens": 532977668.0, "step": 13974 }, { "epoch": 1.7777636433023787, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.34480667114258, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8581176996231079, "num_tokens": 533013862.0, "step": 13975 }, { "epoch": 1.7778908535809692, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.57158279418945, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8500410914421082, "num_tokens": 533051222.0, "step": 13976 }, { "epoch": 1.7780180638595597, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.10051345825195, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8584351539611816, "num_tokens": 533090182.0, "step": 13977 }, { "epoch": 1.7781452741381503, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.60300827026367, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8739204406738281, "num_tokens": 533129600.0, "step": 13978 }, { "epoch": 1.7782724844167408, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.172203063964844, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.860764741897583, "num_tokens": 533172351.0, "step": 13979 }, { "epoch": 1.7783996946953313, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.50776672363281, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8575425744056702, "num_tokens": 533211583.0, "step": 13980 }, { "epoch": 1.7785269049739219, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.09917068481445, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8668190240859985, "num_tokens": 533249960.0, "step": 13981 }, { "epoch": 1.7786541152525124, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.8087043762207, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.866895318031311, "num_tokens": 533287328.0, "step": 13982 }, { "epoch": 1.778781325531103, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.46387481689453, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8625165224075317, "num_tokens": 533327772.0, "step": 13983 }, { "epoch": 1.7789085358096934, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.753936767578125, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.854498028755188, "num_tokens": 533372004.0, "step": 13984 }, { "epoch": 1.779035746088284, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.43083953857422, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8567178249359131, "num_tokens": 533409502.0, "step": 13985 }, { "epoch": 1.7791629563668745, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.826072692871094, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8483878374099731, "num_tokens": 533451160.0, "step": 13986 }, { "epoch": 1.779290166645465, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.40675354003906, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8684667348861694, "num_tokens": 533480413.0, "step": 13987 }, { "epoch": 1.7794173769240555, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.61654281616211, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8491322994232178, "num_tokens": 533518826.0, "step": 13988 }, { "epoch": 1.779544587202646, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.351383209228516, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8498569130897522, "num_tokens": 533563760.0, "step": 13989 }, { "epoch": 1.7796717974812366, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.680076599121094, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8592308759689331, "num_tokens": 533601499.0, "step": 13990 }, { "epoch": 1.7797990077598271, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.83119583129883, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8648606538772583, "num_tokens": 533640757.0, "step": 13991 }, { "epoch": 1.7799262180384177, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.11109161376953, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8635773062705994, "num_tokens": 533678047.0, "step": 13992 }, { "epoch": 1.780053428317008, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.902225494384766, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8612701892852783, "num_tokens": 533720903.0, "step": 13993 }, { "epoch": 1.7801806385955985, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.23195266723633, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8621824979782104, "num_tokens": 533759720.0, "step": 13994 }, { "epoch": 1.780307848874189, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.65436553955078, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.84744793176651, "num_tokens": 533796122.0, "step": 13995 }, { "epoch": 1.7804350591527796, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.53300094604492, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.86759352684021, "num_tokens": 533836326.0, "step": 13996 }, { "epoch": 1.78056226943137, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.474220275878906, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8711845278739929, "num_tokens": 533867874.0, "step": 13997 }, { "epoch": 1.7806894797099606, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.251060485839844, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8721067309379578, "num_tokens": 533902877.0, "step": 13998 }, { "epoch": 1.780816689988551, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.922977447509766, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8651300072669983, "num_tokens": 533937440.0, "step": 13999 }, { "epoch": 1.7809439002671414, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.11038589477539, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.870018720626831, "num_tokens": 533972308.0, "step": 14000 }, { "epoch": 1.781071110545732, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.73328399658203, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.847707211971283, "num_tokens": 534011913.0, "step": 14001 }, { "epoch": 1.7811983208243225, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.528419494628906, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.873139500617981, "num_tokens": 534057235.0, "step": 14002 }, { "epoch": 1.781325531102913, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.7881965637207, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8620694875717163, "num_tokens": 534092850.0, "step": 14003 }, { "epoch": 1.7814527413815036, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.520721435546875, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8663432002067566, "num_tokens": 534135310.0, "step": 14004 }, { "epoch": 1.781579951660094, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.9175910949707, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8583241701126099, "num_tokens": 534176008.0, "step": 14005 }, { "epoch": 1.7817071619386846, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.70813751220703, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8609437942504883, "num_tokens": 534208217.0, "step": 14006 }, { "epoch": 1.7818343722172751, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.62434387207031, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8643996715545654, "num_tokens": 534247564.0, "step": 14007 }, { "epoch": 1.7819615824958657, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.75518035888672, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8436886668205261, "num_tokens": 534281144.0, "step": 14008 }, { "epoch": 1.7820887927744562, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.32530212402344, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8494879603385925, "num_tokens": 534317893.0, "step": 14009 }, { "epoch": 1.7822160030530467, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.990047454833984, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.85798180103302, "num_tokens": 534356116.0, "step": 14010 }, { "epoch": 1.7823432133316373, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.147216796875, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8343789577484131, "num_tokens": 534390499.0, "step": 14011 }, { "epoch": 1.7824704236102278, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.552772521972656, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8558557033538818, "num_tokens": 534424242.0, "step": 14012 }, { "epoch": 1.7825976338888183, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.25003433227539, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8521544337272644, "num_tokens": 534467465.0, "step": 14013 }, { "epoch": 1.7827248441674088, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.41537094116211, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8540446758270264, "num_tokens": 534502959.0, "step": 14014 }, { "epoch": 1.7828520544459994, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.41326904296875, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8622022867202759, "num_tokens": 534549888.0, "step": 14015 }, { "epoch": 1.78297926472459, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.59621810913086, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8623966574668884, "num_tokens": 534591686.0, "step": 14016 }, { "epoch": 1.7831064750031802, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.21152114868164, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.877173900604248, "num_tokens": 534628584.0, "step": 14017 }, { "epoch": 1.7832336852817707, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.73701477050781, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8484526872634888, "num_tokens": 534665684.0, "step": 14018 }, { "epoch": 1.7833608955603613, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 42.827754974365234, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8632676601409912, "num_tokens": 534701604.0, "step": 14019 }, { "epoch": 1.7834881058389518, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.09929656982422, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8448330163955688, "num_tokens": 534741765.0, "step": 14020 }, { "epoch": 1.7836153161175423, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 42.94871520996094, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8696528673171997, "num_tokens": 534780331.0, "step": 14021 }, { "epoch": 1.7837425263961328, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.74357986450195, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.8478976488113403, "num_tokens": 534816254.0, "step": 14022 }, { "epoch": 1.7838697366747234, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.49119186401367, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8583147525787354, "num_tokens": 534851367.0, "step": 14023 }, { "epoch": 1.7839969469533137, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.33202362060547, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8584403991699219, "num_tokens": 534889077.0, "step": 14024 }, { "epoch": 1.7841241572319042, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.185752868652344, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8502025604248047, "num_tokens": 534924249.0, "step": 14025 }, { "epoch": 1.7842513675104947, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.402896881103516, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8450835347175598, "num_tokens": 534966370.0, "step": 14026 }, { "epoch": 1.7843785777890853, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.30634307861328, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8670885562896729, "num_tokens": 535005869.0, "step": 14027 }, { "epoch": 1.7845057880676758, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.534934997558594, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8569812774658203, "num_tokens": 535042057.0, "step": 14028 }, { "epoch": 1.7846329983462663, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.44626235961914, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8547290563583374, "num_tokens": 535074410.0, "step": 14029 }, { "epoch": 1.7847602086248568, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.04883575439453, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8512039184570312, "num_tokens": 535120356.0, "step": 14030 }, { "epoch": 1.7848874189034474, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.25486373901367, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8492119908332825, "num_tokens": 535156939.0, "step": 14031 }, { "epoch": 1.785014629182038, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.6507568359375, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8627413511276245, "num_tokens": 535195553.0, "step": 14032 }, { "epoch": 1.7851418394606284, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.648433685302734, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8657792210578918, "num_tokens": 535235692.0, "step": 14033 }, { "epoch": 1.785269049739219, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.1324577331543, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8583489656448364, "num_tokens": 535269265.0, "step": 14034 }, { "epoch": 1.7853962600178095, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.191043853759766, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8751785755157471, "num_tokens": 535303825.0, "step": 14035 }, { "epoch": 1.7855234702964, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.91392517089844, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8539111614227295, "num_tokens": 535338802.0, "step": 14036 }, { "epoch": 1.7856506805749905, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.35285186767578, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8666384220123291, "num_tokens": 535376592.0, "step": 14037 }, { "epoch": 1.785777890853581, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.714351654052734, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8565511703491211, "num_tokens": 535416528.0, "step": 14038 }, { "epoch": 1.7859051011321716, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.64911651611328, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8559253215789795, "num_tokens": 535450892.0, "step": 14039 }, { "epoch": 1.7860323114107621, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.08773422241211, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8615926504135132, "num_tokens": 535488198.0, "step": 14040 }, { "epoch": 1.7861595216893527, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.946781158447266, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8580421209335327, "num_tokens": 535526020.0, "step": 14041 }, { "epoch": 1.786286731967943, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.20348358154297, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.864334225654602, "num_tokens": 535560400.0, "step": 14042 }, { "epoch": 1.7864139422465335, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.493186950683594, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8524309396743774, "num_tokens": 535594699.0, "step": 14043 }, { "epoch": 1.786541152525124, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 42.80394744873047, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8650836944580078, "num_tokens": 535627019.0, "step": 14044 }, { "epoch": 1.7866683628037145, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.68234634399414, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8697009682655334, "num_tokens": 535667479.0, "step": 14045 }, { "epoch": 1.786795573082305, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.39686584472656, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.853631317615509, "num_tokens": 535705211.0, "step": 14046 }, { "epoch": 1.7869227833608956, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.71065139770508, "learning_rate": 1e-06, "loss": 0.6216, "mean_token_accuracy": 0.8399842977523804, "num_tokens": 535744907.0, "step": 14047 }, { "epoch": 1.787049993639486, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.382022857666016, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8580077886581421, "num_tokens": 535782526.0, "step": 14048 }, { "epoch": 1.7871772039180764, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.72999572753906, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8754722476005554, "num_tokens": 535824153.0, "step": 14049 }, { "epoch": 1.787304414196667, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.67277526855469, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8718191981315613, "num_tokens": 535859019.0, "step": 14050 }, { "epoch": 1.7874316244752575, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.5868034362793, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8726181387901306, "num_tokens": 535896866.0, "step": 14051 }, { "epoch": 1.787558834753848, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.705814361572266, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8615577816963196, "num_tokens": 535937128.0, "step": 14052 }, { "epoch": 1.7876860450324386, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.79127502441406, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8763412833213806, "num_tokens": 535972727.0, "step": 14053 }, { "epoch": 1.787813255311029, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.21504592895508, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8657845854759216, "num_tokens": 536011491.0, "step": 14054 }, { "epoch": 1.7879404655896196, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.54038619995117, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8644434809684753, "num_tokens": 536048074.0, "step": 14055 }, { "epoch": 1.7880676758682101, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.68902587890625, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8679310083389282, "num_tokens": 536089554.0, "step": 14056 }, { "epoch": 1.7881948861468007, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.073909759521484, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.870336651802063, "num_tokens": 536127672.0, "step": 14057 }, { "epoch": 1.7883220964253912, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.68268585205078, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8609811663627625, "num_tokens": 536157517.0, "step": 14058 }, { "epoch": 1.7884493067039817, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.414581298828125, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8745859265327454, "num_tokens": 536192855.0, "step": 14059 }, { "epoch": 1.7885765169825723, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.17683410644531, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8618618249893188, "num_tokens": 536231727.0, "step": 14060 }, { "epoch": 1.7887037272611628, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.2796516418457, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8738510012626648, "num_tokens": 536270738.0, "step": 14061 }, { "epoch": 1.7888309375397533, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.44562530517578, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8797576427459717, "num_tokens": 536313006.0, "step": 14062 }, { "epoch": 1.7889581478183438, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.83994674682617, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.854668140411377, "num_tokens": 536350697.0, "step": 14063 }, { "epoch": 1.7890853580969344, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.08356857299805, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8694996237754822, "num_tokens": 536394252.0, "step": 14064 }, { "epoch": 1.789212568375525, "ewc_loss": 0.1494140625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001277923583984375, "grad_norm": 43.34849548339844, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8573542833328247, "num_tokens": 536426591.0, "step": 14065 }, { "epoch": 1.7893397786541152, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.71029281616211, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8489665985107422, "num_tokens": 536468567.0, "step": 14066 }, { "epoch": 1.7894669889327057, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.59385299682617, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8512495160102844, "num_tokens": 536506352.0, "step": 14067 }, { "epoch": 1.7895941992112963, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.680171966552734, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8553618788719177, "num_tokens": 536538122.0, "step": 14068 }, { "epoch": 1.7897214094898868, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.684635162353516, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.852992594242096, "num_tokens": 536574617.0, "step": 14069 }, { "epoch": 1.7898486197684773, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.71607208251953, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.87340247631073, "num_tokens": 536611895.0, "step": 14070 }, { "epoch": 1.7899758300470678, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.45953369140625, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8538069725036621, "num_tokens": 536651336.0, "step": 14071 }, { "epoch": 1.7901030403256584, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.80298614501953, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8562239408493042, "num_tokens": 536692566.0, "step": 14072 }, { "epoch": 1.7902302506042487, "ewc_loss": 0.150390625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012874603271484375, "grad_norm": 43.36857986450195, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8675804138183594, "num_tokens": 536730855.0, "step": 14073 }, { "epoch": 1.7903574608828392, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.054325103759766, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8576162457466125, "num_tokens": 536766478.0, "step": 14074 }, { "epoch": 1.7904846711614297, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.449241638183594, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8539096713066101, "num_tokens": 536799900.0, "step": 14075 }, { "epoch": 1.7906118814400203, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.86612319946289, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8675650358200073, "num_tokens": 536843597.0, "step": 14076 }, { "epoch": 1.7907390917186108, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.62399673461914, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8723920583724976, "num_tokens": 536879189.0, "step": 14077 }, { "epoch": 1.7908663019972013, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.69589614868164, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8667495250701904, "num_tokens": 536918099.0, "step": 14078 }, { "epoch": 1.7909935122757918, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.85902786254883, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8527313470840454, "num_tokens": 536954410.0, "step": 14079 }, { "epoch": 1.7911207225543824, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.448974609375, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.882866621017456, "num_tokens": 536986688.0, "step": 14080 }, { "epoch": 1.791247932832973, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.48493194580078, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8629037737846375, "num_tokens": 537024181.0, "step": 14081 }, { "epoch": 1.7913751431115634, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.97569274902344, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8652110695838928, "num_tokens": 537064174.0, "step": 14082 }, { "epoch": 1.791502353390154, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.94413757324219, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8538179993629456, "num_tokens": 537100279.0, "step": 14083 }, { "epoch": 1.7916295636687445, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.463436126708984, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.8362346887588501, "num_tokens": 537135339.0, "step": 14084 }, { "epoch": 1.791756773947335, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.94269561767578, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8556616306304932, "num_tokens": 537169608.0, "step": 14085 }, { "epoch": 1.7918839842259255, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.52230453491211, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8682347536087036, "num_tokens": 537205560.0, "step": 14086 }, { "epoch": 1.792011194504516, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.007938385009766, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8493914604187012, "num_tokens": 537238697.0, "step": 14087 }, { "epoch": 1.7921384047831066, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.48302459716797, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8556708097457886, "num_tokens": 537282365.0, "step": 14088 }, { "epoch": 1.7922656150616971, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.7899055480957, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8674635291099548, "num_tokens": 537320805.0, "step": 14089 }, { "epoch": 1.7923928253402877, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.818199157714844, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8638390302658081, "num_tokens": 537354607.0, "step": 14090 }, { "epoch": 1.792520035618878, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.59938430786133, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8718780279159546, "num_tokens": 537386672.0, "step": 14091 }, { "epoch": 1.7926472458974685, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.58833312988281, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8635729551315308, "num_tokens": 537430882.0, "step": 14092 }, { "epoch": 1.792774456176059, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.69612503051758, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8639102578163147, "num_tokens": 537465114.0, "step": 14093 }, { "epoch": 1.7929016664546495, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.336063385009766, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8553225994110107, "num_tokens": 537502820.0, "step": 14094 }, { "epoch": 1.79302887673324, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.70725631713867, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8686391115188599, "num_tokens": 537537422.0, "step": 14095 }, { "epoch": 1.7931560870118306, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.64265441894531, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8639774918556213, "num_tokens": 537566828.0, "step": 14096 }, { "epoch": 1.793283297290421, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.301570892333984, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8586382865905762, "num_tokens": 537606102.0, "step": 14097 }, { "epoch": 1.7934105075690114, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.929107666015625, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8681702613830566, "num_tokens": 537638527.0, "step": 14098 }, { "epoch": 1.793537717847602, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.475257873535156, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8599205017089844, "num_tokens": 537683375.0, "step": 14099 }, { "epoch": 1.7936649281261925, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.595829010009766, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8568191528320312, "num_tokens": 537731354.0, "step": 14100 }, { "epoch": 1.793792138404783, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.67327117919922, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8560555577278137, "num_tokens": 537770055.0, "step": 14101 }, { "epoch": 1.7939193486833735, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.19766616821289, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.8409311175346375, "num_tokens": 537807977.0, "step": 14102 }, { "epoch": 1.794046558961964, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.29442596435547, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8546169996261597, "num_tokens": 537847809.0, "step": 14103 }, { "epoch": 1.7941737692405546, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.229007720947266, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.842046320438385, "num_tokens": 537885506.0, "step": 14104 }, { "epoch": 1.7943009795191451, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.99584197998047, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8566372394561768, "num_tokens": 537918936.0, "step": 14105 }, { "epoch": 1.7944281897977357, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 42.99509811401367, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8657796382904053, "num_tokens": 537954205.0, "step": 14106 }, { "epoch": 1.7945554000763262, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.170654296875, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8635965585708618, "num_tokens": 537989190.0, "step": 14107 }, { "epoch": 1.7946826103549167, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.0710563659668, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8511573076248169, "num_tokens": 538028852.0, "step": 14108 }, { "epoch": 1.7948098206335072, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.742431640625, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8661074638366699, "num_tokens": 538061245.0, "step": 14109 }, { "epoch": 1.7949370309120978, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.394187927246094, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8739850521087646, "num_tokens": 538103238.0, "step": 14110 }, { "epoch": 1.7950642411906883, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.785552978515625, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8476471304893494, "num_tokens": 538142692.0, "step": 14111 }, { "epoch": 1.7951914514692788, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.10585403442383, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8598207831382751, "num_tokens": 538183155.0, "step": 14112 }, { "epoch": 1.7953186617478694, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.561092376708984, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8594789505004883, "num_tokens": 538222063.0, "step": 14113 }, { "epoch": 1.7954458720264599, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.52602767944336, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8624370098114014, "num_tokens": 538259847.0, "step": 14114 }, { "epoch": 1.7955730823050502, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.4871826171875, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8673810958862305, "num_tokens": 538298557.0, "step": 14115 }, { "epoch": 1.7957002925836407, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.43070602416992, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8586688041687012, "num_tokens": 538337926.0, "step": 14116 }, { "epoch": 1.7958275028622313, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.33283996582031, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.86586993932724, "num_tokens": 538382755.0, "step": 14117 }, { "epoch": 1.7959547131408218, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.33213806152344, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8407109975814819, "num_tokens": 538417639.0, "step": 14118 }, { "epoch": 1.7960819234194123, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.69939422607422, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8556206822395325, "num_tokens": 538458444.0, "step": 14119 }, { "epoch": 1.7962091336980028, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.32762908935547, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8635127544403076, "num_tokens": 538496580.0, "step": 14120 }, { "epoch": 1.7963363439765934, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.605690002441406, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8703035116195679, "num_tokens": 538538474.0, "step": 14121 }, { "epoch": 1.7964635542551837, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.01987838745117, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8631271123886108, "num_tokens": 538576828.0, "step": 14122 }, { "epoch": 1.7965907645337742, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.81576156616211, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.867668092250824, "num_tokens": 538611630.0, "step": 14123 }, { "epoch": 1.7967179748123647, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.1834716796875, "learning_rate": 1e-06, "loss": 0.6545, "mean_token_accuracy": 0.8361427187919617, "num_tokens": 538659537.0, "step": 14124 }, { "epoch": 1.7968451850909553, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.07004165649414, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8703111410140991, "num_tokens": 538700216.0, "step": 14125 }, { "epoch": 1.7969723953695458, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.4048957824707, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8596993684768677, "num_tokens": 538737719.0, "step": 14126 }, { "epoch": 1.7970996056481363, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.553524017333984, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8685865998268127, "num_tokens": 538774723.0, "step": 14127 }, { "epoch": 1.7972268159267268, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.10466766357422, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8702137470245361, "num_tokens": 538815288.0, "step": 14128 }, { "epoch": 1.7973540262053174, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.60455322265625, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8616456985473633, "num_tokens": 538858239.0, "step": 14129 }, { "epoch": 1.797481236483908, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.19523620605469, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.863417387008667, "num_tokens": 538896947.0, "step": 14130 }, { "epoch": 1.7976084467624984, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.07527160644531, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8587580919265747, "num_tokens": 538936511.0, "step": 14131 }, { "epoch": 1.797735657041089, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.242366790771484, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8534447550773621, "num_tokens": 538969198.0, "step": 14132 }, { "epoch": 1.7978628673196795, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.30368423461914, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8713008761405945, "num_tokens": 539002258.0, "step": 14133 }, { "epoch": 1.79799007759827, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.146697998046875, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8474128842353821, "num_tokens": 539039993.0, "step": 14134 }, { "epoch": 1.7981172878768605, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.06277847290039, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8576827049255371, "num_tokens": 539080713.0, "step": 14135 }, { "epoch": 1.798244498155451, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.562156677246094, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.87227463722229, "num_tokens": 539121115.0, "step": 14136 }, { "epoch": 1.7983717084340416, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.81135940551758, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8794536590576172, "num_tokens": 539159889.0, "step": 14137 }, { "epoch": 1.7984989187126321, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.5594482421875, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8610864877700806, "num_tokens": 539201508.0, "step": 14138 }, { "epoch": 1.7986261289912227, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.776920318603516, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8435859680175781, "num_tokens": 539238640.0, "step": 14139 }, { "epoch": 1.798753339269813, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.194297790527344, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8566452264785767, "num_tokens": 539274797.0, "step": 14140 }, { "epoch": 1.7988805495484035, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.57861328125, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8481683731079102, "num_tokens": 539308410.0, "step": 14141 }, { "epoch": 1.799007759826994, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.25083923339844, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8540109992027283, "num_tokens": 539345578.0, "step": 14142 }, { "epoch": 1.7991349701055845, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.623565673828125, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8449670076370239, "num_tokens": 539378946.0, "step": 14143 }, { "epoch": 1.799262180384175, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.05207061767578, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8735146522521973, "num_tokens": 539417663.0, "step": 14144 }, { "epoch": 1.7993893906627656, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.64552688598633, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8761873245239258, "num_tokens": 539459597.0, "step": 14145 }, { "epoch": 1.799516600941356, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.186561584472656, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.864246129989624, "num_tokens": 539499579.0, "step": 14146 }, { "epoch": 1.7996438112199464, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.832923889160156, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.870418906211853, "num_tokens": 539538505.0, "step": 14147 }, { "epoch": 1.799771021498537, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.69102478027344, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8544138669967651, "num_tokens": 539574824.0, "step": 14148 }, { "epoch": 1.7998982317771275, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.235225677490234, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.849307656288147, "num_tokens": 539614181.0, "step": 14149 }, { "epoch": 1.800025442055718, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.785987854003906, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8494899272918701, "num_tokens": 539656713.0, "step": 14150 }, { "epoch": 1.8001526523343085, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.311405181884766, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8596931099891663, "num_tokens": 539695436.0, "step": 14151 }, { "epoch": 1.800279862612899, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.96157455444336, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8620682954788208, "num_tokens": 539736549.0, "step": 14152 }, { "epoch": 1.8004070728914896, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.254486083984375, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.866354763507843, "num_tokens": 539776326.0, "step": 14153 }, { "epoch": 1.8005342831700801, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.51861572265625, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8685339689254761, "num_tokens": 539819665.0, "step": 14154 }, { "epoch": 1.8006614934486707, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.704776763916016, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8605624437332153, "num_tokens": 539858175.0, "step": 14155 }, { "epoch": 1.8007887037272612, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.58707809448242, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8596689701080322, "num_tokens": 539895152.0, "step": 14156 }, { "epoch": 1.8009159140058517, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.39421463012695, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8654814958572388, "num_tokens": 539941033.0, "step": 14157 }, { "epoch": 1.8010431242844422, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.731807708740234, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8630325794219971, "num_tokens": 539977959.0, "step": 14158 }, { "epoch": 1.8011703345630328, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.431800842285156, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8581725358963013, "num_tokens": 540014857.0, "step": 14159 }, { "epoch": 1.8012975448416233, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.7127571105957, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8586703538894653, "num_tokens": 540055775.0, "step": 14160 }, { "epoch": 1.8014247551202138, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.04045486450195, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.869813084602356, "num_tokens": 540090287.0, "step": 14161 }, { "epoch": 1.8015519653988044, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.68437957763672, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8689742088317871, "num_tokens": 540129139.0, "step": 14162 }, { "epoch": 1.8016791756773949, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 42.97481918334961, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.859550416469574, "num_tokens": 540165187.0, "step": 14163 }, { "epoch": 1.8018063859559852, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.88840103149414, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8726094961166382, "num_tokens": 540201113.0, "step": 14164 }, { "epoch": 1.8019335962345757, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.324649810791016, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8625761866569519, "num_tokens": 540237448.0, "step": 14165 }, { "epoch": 1.8020608065131662, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.501834869384766, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8550382256507874, "num_tokens": 540279703.0, "step": 14166 }, { "epoch": 1.8021880167917568, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.7440299987793, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8406542539596558, "num_tokens": 540313071.0, "step": 14167 }, { "epoch": 1.8023152270703473, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.01252746582031, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8655059933662415, "num_tokens": 540353940.0, "step": 14168 }, { "epoch": 1.8024424373489378, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.83575439453125, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.857140064239502, "num_tokens": 540391241.0, "step": 14169 }, { "epoch": 1.8025696476275284, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.43178939819336, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8738489747047424, "num_tokens": 540420205.0, "step": 14170 }, { "epoch": 1.8026968579061187, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.96377182006836, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8639010190963745, "num_tokens": 540452121.0, "step": 14171 }, { "epoch": 1.8028240681847092, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.10816955566406, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8609137535095215, "num_tokens": 540487415.0, "step": 14172 }, { "epoch": 1.8029512784632997, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.65461349487305, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.851357638835907, "num_tokens": 540522293.0, "step": 14173 }, { "epoch": 1.8030784887418903, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.9599494934082, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8715353012084961, "num_tokens": 540560345.0, "step": 14174 }, { "epoch": 1.8032056990204808, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.47675704956055, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8700706958770752, "num_tokens": 540598299.0, "step": 14175 }, { "epoch": 1.8033329092990713, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.17971420288086, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8726922869682312, "num_tokens": 540634480.0, "step": 14176 }, { "epoch": 1.8034601195776618, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.032962799072266, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8525441288948059, "num_tokens": 540679473.0, "step": 14177 }, { "epoch": 1.8035873298562524, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.039222717285156, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8555930256843567, "num_tokens": 540722259.0, "step": 14178 }, { "epoch": 1.803714540134843, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 42.807247161865234, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8650376796722412, "num_tokens": 540758058.0, "step": 14179 }, { "epoch": 1.8038417504134334, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.971534729003906, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8567959070205688, "num_tokens": 540798535.0, "step": 14180 }, { "epoch": 1.803968960692024, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.26897430419922, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8603175282478333, "num_tokens": 540839423.0, "step": 14181 }, { "epoch": 1.8040961709706145, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.79397201538086, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8665396571159363, "num_tokens": 540887512.0, "step": 14182 }, { "epoch": 1.804223381249205, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.44795227050781, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.86379075050354, "num_tokens": 540922292.0, "step": 14183 }, { "epoch": 1.8043505915277955, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.723758697509766, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8679847121238708, "num_tokens": 540964105.0, "step": 14184 }, { "epoch": 1.804477801806386, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 42.997657775878906, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8589773774147034, "num_tokens": 541001793.0, "step": 14185 }, { "epoch": 1.8046050120849766, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.04609298706055, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8666840195655823, "num_tokens": 541041104.0, "step": 14186 }, { "epoch": 1.8047322223635671, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.0074577331543, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8562273979187012, "num_tokens": 541079643.0, "step": 14187 }, { "epoch": 1.8048594326421576, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.214561462402344, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8663679361343384, "num_tokens": 541118554.0, "step": 14188 }, { "epoch": 1.804986642920748, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 42.91107940673828, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8503660559654236, "num_tokens": 541156893.0, "step": 14189 }, { "epoch": 1.8051138531993385, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.06946563720703, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8780068159103394, "num_tokens": 541197283.0, "step": 14190 }, { "epoch": 1.805241063477929, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.37537384033203, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8748564720153809, "num_tokens": 541232343.0, "step": 14191 }, { "epoch": 1.8053682737565195, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.603885650634766, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8526046872138977, "num_tokens": 541270274.0, "step": 14192 }, { "epoch": 1.80549548403511, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.8390998840332, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8548213839530945, "num_tokens": 541306018.0, "step": 14193 }, { "epoch": 1.8056226943137006, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.789180755615234, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8620086908340454, "num_tokens": 541344974.0, "step": 14194 }, { "epoch": 1.805749904592291, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.303462982177734, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8715661764144897, "num_tokens": 541378590.0, "step": 14195 }, { "epoch": 1.8058771148708814, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.06114196777344, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8641908764839172, "num_tokens": 541415945.0, "step": 14196 }, { "epoch": 1.806004325149472, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.57759475708008, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8580982685089111, "num_tokens": 541459577.0, "step": 14197 }, { "epoch": 1.8061315354280625, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.909427642822266, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8614639043807983, "num_tokens": 541501041.0, "step": 14198 }, { "epoch": 1.806258745706653, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.498233795166016, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8544498085975647, "num_tokens": 541538587.0, "step": 14199 }, { "epoch": 1.8063859559852435, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.264225006103516, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8485161662101746, "num_tokens": 541582225.0, "step": 14200 }, { "epoch": 1.806513166263834, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.03982162475586, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8704157471656799, "num_tokens": 541616956.0, "step": 14201 }, { "epoch": 1.8066403765424246, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.538238525390625, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8507193326950073, "num_tokens": 541650814.0, "step": 14202 }, { "epoch": 1.8067675868210151, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.190528869628906, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8718744516372681, "num_tokens": 541692167.0, "step": 14203 }, { "epoch": 1.8068947970996057, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.926761627197266, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8512046337127686, "num_tokens": 541732427.0, "step": 14204 }, { "epoch": 1.8070220073781962, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.60617446899414, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8618471026420593, "num_tokens": 541773256.0, "step": 14205 }, { "epoch": 1.8071492176567867, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.299774169921875, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8670819401741028, "num_tokens": 541815008.0, "step": 14206 }, { "epoch": 1.8072764279353772, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.38799285888672, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8643208742141724, "num_tokens": 541853462.0, "step": 14207 }, { "epoch": 1.8074036382139678, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.86702346801758, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8564914464950562, "num_tokens": 541890487.0, "step": 14208 }, { "epoch": 1.8075308484925583, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.72538375854492, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8665264844894409, "num_tokens": 541929857.0, "step": 14209 }, { "epoch": 1.8076580587711488, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.414207458496094, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8542076349258423, "num_tokens": 541969923.0, "step": 14210 }, { "epoch": 1.8077852690497394, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.608604431152344, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8716974258422852, "num_tokens": 542008802.0, "step": 14211 }, { "epoch": 1.8079124793283299, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.85900115966797, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8749111890792847, "num_tokens": 542049276.0, "step": 14212 }, { "epoch": 1.8080396896069202, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.46601867675781, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8548661470413208, "num_tokens": 542086921.0, "step": 14213 }, { "epoch": 1.8081668998855107, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.82000732421875, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8607591986656189, "num_tokens": 542124291.0, "step": 14214 }, { "epoch": 1.8082941101641012, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.311275482177734, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8504191637039185, "num_tokens": 542167600.0, "step": 14215 }, { "epoch": 1.8084213204426918, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.066532135009766, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8558627367019653, "num_tokens": 542201134.0, "step": 14216 }, { "epoch": 1.8085485307212823, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.08323669433594, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8485212326049805, "num_tokens": 542242590.0, "step": 14217 }, { "epoch": 1.8086757409998728, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.96736145019531, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8604406714439392, "num_tokens": 542281945.0, "step": 14218 }, { "epoch": 1.8088029512784631, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.67917251586914, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8693053722381592, "num_tokens": 542319389.0, "step": 14219 }, { "epoch": 1.8089301615570537, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.87353515625, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8482130765914917, "num_tokens": 542356912.0, "step": 14220 }, { "epoch": 1.8090573718356442, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.908939361572266, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.853934645652771, "num_tokens": 542394054.0, "step": 14221 }, { "epoch": 1.8091845821142347, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.62627410888672, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8671743273735046, "num_tokens": 542431063.0, "step": 14222 }, { "epoch": 1.8093117923928252, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.825157165527344, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8499369621276855, "num_tokens": 542471398.0, "step": 14223 }, { "epoch": 1.8094390026714158, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.59254455566406, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8471323251724243, "num_tokens": 542516288.0, "step": 14224 }, { "epoch": 1.8095662129500063, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.03457260131836, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8595516085624695, "num_tokens": 542552292.0, "step": 14225 }, { "epoch": 1.8096934232285968, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.51224136352539, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.863487958908081, "num_tokens": 542592380.0, "step": 14226 }, { "epoch": 1.8098206335071874, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.79500198364258, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8529350161552429, "num_tokens": 542632698.0, "step": 14227 }, { "epoch": 1.8099478437857779, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.39948654174805, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8506613373756409, "num_tokens": 542668234.0, "step": 14228 }, { "epoch": 1.8100750540643684, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.980079650878906, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8592531681060791, "num_tokens": 542706205.0, "step": 14229 }, { "epoch": 1.810202264342959, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.67463684082031, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8569005727767944, "num_tokens": 542737068.0, "step": 14230 }, { "epoch": 1.8103294746215495, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.919960021972656, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8725178241729736, "num_tokens": 542769776.0, "step": 14231 }, { "epoch": 1.81045668490014, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.58607864379883, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.872700572013855, "num_tokens": 542802440.0, "step": 14232 }, { "epoch": 1.8105838951787305, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.1264533996582, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8739017248153687, "num_tokens": 542841158.0, "step": 14233 }, { "epoch": 1.810711105457321, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.609188079833984, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8555799722671509, "num_tokens": 542876637.0, "step": 14234 }, { "epoch": 1.8108383157359116, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.86717224121094, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8663988709449768, "num_tokens": 542915087.0, "step": 14235 }, { "epoch": 1.8109655260145021, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.014041900634766, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8533952832221985, "num_tokens": 542950635.0, "step": 14236 }, { "epoch": 1.8110927362930926, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.635032653808594, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8565733432769775, "num_tokens": 542990078.0, "step": 14237 }, { "epoch": 1.811219946571683, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.22087097167969, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8354073762893677, "num_tokens": 543030333.0, "step": 14238 }, { "epoch": 1.8113471568502735, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.59111785888672, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8544795513153076, "num_tokens": 543070537.0, "step": 14239 }, { "epoch": 1.811474367128864, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.22420883178711, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8587901592254639, "num_tokens": 543109137.0, "step": 14240 }, { "epoch": 1.8116015774074545, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.779056549072266, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.853466272354126, "num_tokens": 543146616.0, "step": 14241 }, { "epoch": 1.811728787686045, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.789955139160156, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8748566508293152, "num_tokens": 543178433.0, "step": 14242 }, { "epoch": 1.8118559979646356, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.48237228393555, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8733241558074951, "num_tokens": 543215086.0, "step": 14243 }, { "epoch": 1.811983208243226, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.77871322631836, "learning_rate": 1e-06, "loss": 0.6145, "mean_token_accuracy": 0.844597339630127, "num_tokens": 543252111.0, "step": 14244 }, { "epoch": 1.8121104185218164, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.16798782348633, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8690559267997742, "num_tokens": 543291938.0, "step": 14245 }, { "epoch": 1.812237628800407, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.7992057800293, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8639695644378662, "num_tokens": 543335141.0, "step": 14246 }, { "epoch": 1.8123648390789975, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.072174072265625, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8609389066696167, "num_tokens": 543371305.0, "step": 14247 }, { "epoch": 1.812492049357588, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.55359649658203, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8714630603790283, "num_tokens": 543407431.0, "step": 14248 }, { "epoch": 1.8126192596361785, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.24797058105469, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8609563708305359, "num_tokens": 543450287.0, "step": 14249 }, { "epoch": 1.812746469914769, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.04890060424805, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8740554451942444, "num_tokens": 543486521.0, "step": 14250 }, { "epoch": 1.8128736801933596, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.88753128051758, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8607889413833618, "num_tokens": 543522157.0, "step": 14251 }, { "epoch": 1.8130008904719501, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.453590393066406, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8645911812782288, "num_tokens": 543560503.0, "step": 14252 }, { "epoch": 1.8131281007505406, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.935829162597656, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8497022986412048, "num_tokens": 543598943.0, "step": 14253 }, { "epoch": 1.8132553110291312, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.87267303466797, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8619997501373291, "num_tokens": 543640466.0, "step": 14254 }, { "epoch": 1.8133825213077217, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.097599029541016, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8621309995651245, "num_tokens": 543677581.0, "step": 14255 }, { "epoch": 1.8135097315863122, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.1044807434082, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8538737297058105, "num_tokens": 543711220.0, "step": 14256 }, { "epoch": 1.8136369418649028, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.614803314208984, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8555161952972412, "num_tokens": 543750109.0, "step": 14257 }, { "epoch": 1.8137641521434933, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.2604866027832, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8650356531143188, "num_tokens": 543788040.0, "step": 14258 }, { "epoch": 1.8138913624220838, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.8149299621582, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8593977689743042, "num_tokens": 543821037.0, "step": 14259 }, { "epoch": 1.8140185727006743, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.28277587890625, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8738638758659363, "num_tokens": 543856331.0, "step": 14260 }, { "epoch": 1.8141457829792649, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.593685150146484, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.866061270236969, "num_tokens": 543892898.0, "step": 14261 }, { "epoch": 1.8142729932578552, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.639591217041016, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.857442319393158, "num_tokens": 543932858.0, "step": 14262 }, { "epoch": 1.8144002035364457, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 44.09173583984375, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8676227331161499, "num_tokens": 543968145.0, "step": 14263 }, { "epoch": 1.8145274138150362, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.1027946472168, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8601511120796204, "num_tokens": 544000495.0, "step": 14264 }, { "epoch": 1.8146546240936268, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.27677536010742, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8560925722122192, "num_tokens": 544041345.0, "step": 14265 }, { "epoch": 1.8147818343722173, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.610042572021484, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8661555051803589, "num_tokens": 544080084.0, "step": 14266 }, { "epoch": 1.8149090446508078, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.999813079833984, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8515487909317017, "num_tokens": 544115828.0, "step": 14267 }, { "epoch": 1.8150362549293981, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.892051696777344, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8575317859649658, "num_tokens": 544153213.0, "step": 14268 }, { "epoch": 1.8151634652079887, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.113956451416016, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8639013171195984, "num_tokens": 544192383.0, "step": 14269 }, { "epoch": 1.8152906754865792, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.35870361328125, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8641155362129211, "num_tokens": 544232034.0, "step": 14270 }, { "epoch": 1.8154178857651697, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.46530532836914, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8442586064338684, "num_tokens": 544266242.0, "step": 14271 }, { "epoch": 1.8155450960437602, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.26933288574219, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.850534200668335, "num_tokens": 544304995.0, "step": 14272 }, { "epoch": 1.8156723063223508, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.45071029663086, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8607429265975952, "num_tokens": 544341950.0, "step": 14273 }, { "epoch": 1.8157995166009413, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.39521408081055, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8555472493171692, "num_tokens": 544379535.0, "step": 14274 }, { "epoch": 1.8159267268795318, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.523826599121094, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8589515686035156, "num_tokens": 544416939.0, "step": 14275 }, { "epoch": 1.8160539371581224, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.57405471801758, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8738975524902344, "num_tokens": 544461476.0, "step": 14276 }, { "epoch": 1.8161811474367129, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.36487579345703, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8618631362915039, "num_tokens": 544505051.0, "step": 14277 }, { "epoch": 1.8163083577153034, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.707889556884766, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8609620332717896, "num_tokens": 544550408.0, "step": 14278 }, { "epoch": 1.816435567993894, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.427711486816406, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8607134819030762, "num_tokens": 544586645.0, "step": 14279 }, { "epoch": 1.8165627782724845, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.3281135559082, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8540211915969849, "num_tokens": 544628087.0, "step": 14280 }, { "epoch": 1.816689988551075, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.37849044799805, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.875471830368042, "num_tokens": 544667961.0, "step": 14281 }, { "epoch": 1.8168171988296655, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.027523040771484, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8586891293525696, "num_tokens": 544704919.0, "step": 14282 }, { "epoch": 1.816944409108256, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.83907699584961, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8658583760261536, "num_tokens": 544741421.0, "step": 14283 }, { "epoch": 1.8170716193868466, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.62961196899414, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8634837865829468, "num_tokens": 544776414.0, "step": 14284 }, { "epoch": 1.817198829665437, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.420413970947266, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8540077209472656, "num_tokens": 544814964.0, "step": 14285 }, { "epoch": 1.8173260399440276, "ewc_loss": 0.1513671875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00012969970703125, "grad_norm": 43.68231964111328, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8476134538650513, "num_tokens": 544853381.0, "step": 14286 }, { "epoch": 1.817453250222618, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.044517517089844, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.8632502555847168, "num_tokens": 544892474.0, "step": 14287 }, { "epoch": 1.8175804605012085, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.00819778442383, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8683419227600098, "num_tokens": 544927667.0, "step": 14288 }, { "epoch": 1.817707670779799, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.868003845214844, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8672509789466858, "num_tokens": 544959780.0, "step": 14289 }, { "epoch": 1.8178348810583895, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.92632293701172, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8520657420158386, "num_tokens": 545001832.0, "step": 14290 }, { "epoch": 1.81796209133698, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.04595947265625, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8692495226860046, "num_tokens": 545038489.0, "step": 14291 }, { "epoch": 1.8180893016155706, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.013206481933594, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8389023542404175, "num_tokens": 545069086.0, "step": 14292 }, { "epoch": 1.818216511894161, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.80873107910156, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8643792867660522, "num_tokens": 545104564.0, "step": 14293 }, { "epoch": 1.8183437221727514, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.97687911987305, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8661856651306152, "num_tokens": 545149036.0, "step": 14294 }, { "epoch": 1.818470932451342, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.880828857421875, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8726736307144165, "num_tokens": 545183825.0, "step": 14295 }, { "epoch": 1.8185981427299325, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.2735710144043, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8644587397575378, "num_tokens": 545221274.0, "step": 14296 }, { "epoch": 1.818725353008523, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.827274322509766, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8681522607803345, "num_tokens": 545259242.0, "step": 14297 }, { "epoch": 1.8188525632871135, "ewc_loss": 0.15625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.3862190246582, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8463286757469177, "num_tokens": 545289380.0, "step": 14298 }, { "epoch": 1.818979773565704, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.35109329223633, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8638405799865723, "num_tokens": 545321935.0, "step": 14299 }, { "epoch": 1.8191069838442946, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.104862213134766, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8683705925941467, "num_tokens": 545365714.0, "step": 14300 }, { "epoch": 1.8192341941228851, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.89724349975586, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8510308265686035, "num_tokens": 545406838.0, "step": 14301 }, { "epoch": 1.8193614044014756, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.799930572509766, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8573172092437744, "num_tokens": 545443265.0, "step": 14302 }, { "epoch": 1.8194886146800662, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.80923843383789, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8707424402236938, "num_tokens": 545479903.0, "step": 14303 }, { "epoch": 1.8196158249586567, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.17390823364258, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8677488565444946, "num_tokens": 545513767.0, "step": 14304 }, { "epoch": 1.8197430352372472, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.07541275024414, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8463319540023804, "num_tokens": 545550541.0, "step": 14305 }, { "epoch": 1.8198702455158378, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.6114501953125, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8625755310058594, "num_tokens": 545590739.0, "step": 14306 }, { "epoch": 1.8199974557944283, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.873435974121094, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8477911949157715, "num_tokens": 545627127.0, "step": 14307 }, { "epoch": 1.8201246660730188, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.61209487915039, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.865229606628418, "num_tokens": 545665473.0, "step": 14308 }, { "epoch": 1.8202518763516093, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.151466369628906, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8765687942504883, "num_tokens": 545708018.0, "step": 14309 }, { "epoch": 1.8203790866301999, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.604522705078125, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8647915720939636, "num_tokens": 545745635.0, "step": 14310 }, { "epoch": 1.8205062969087902, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.996822357177734, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.851141095161438, "num_tokens": 545785260.0, "step": 14311 }, { "epoch": 1.8206335071873807, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.03327560424805, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8571760654449463, "num_tokens": 545821696.0, "step": 14312 }, { "epoch": 1.8207607174659712, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.589168548583984, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.874937891960144, "num_tokens": 545857971.0, "step": 14313 }, { "epoch": 1.8208879277445618, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.34389877319336, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8501748442649841, "num_tokens": 545891000.0, "step": 14314 }, { "epoch": 1.8210151380231523, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.4462890625, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8457680940628052, "num_tokens": 545932825.0, "step": 14315 }, { "epoch": 1.8211423483017428, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.66960144042969, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8508412837982178, "num_tokens": 545972849.0, "step": 14316 }, { "epoch": 1.8212695585803331, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.51840591430664, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8741254806518555, "num_tokens": 546011492.0, "step": 14317 }, { "epoch": 1.8213967688589237, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.85782241821289, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8574219346046448, "num_tokens": 546048461.0, "step": 14318 }, { "epoch": 1.8215239791375142, "ewc_loss": 0.15234375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013065338134765625, "grad_norm": 43.43768310546875, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8704563975334167, "num_tokens": 546087113.0, "step": 14319 }, { "epoch": 1.8216511894161047, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.547908782958984, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8645721673965454, "num_tokens": 546124413.0, "step": 14320 }, { "epoch": 1.8217783996946952, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.58405303955078, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8728492259979248, "num_tokens": 546165763.0, "step": 14321 }, { "epoch": 1.8219056099732858, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.50798797607422, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8621916174888611, "num_tokens": 546203218.0, "step": 14322 }, { "epoch": 1.8220328202518763, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.750736236572266, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8505585193634033, "num_tokens": 546247610.0, "step": 14323 }, { "epoch": 1.8221600305304668, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.142704010009766, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8544976711273193, "num_tokens": 546288952.0, "step": 14324 }, { "epoch": 1.8222872408090574, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.999908447265625, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8723776340484619, "num_tokens": 546322449.0, "step": 14325 }, { "epoch": 1.8224144510876479, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.751583099365234, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8664904832839966, "num_tokens": 546362692.0, "step": 14326 }, { "epoch": 1.8225416613662384, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.92250442504883, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8455721139907837, "num_tokens": 546398856.0, "step": 14327 }, { "epoch": 1.822668871644829, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.79001235961914, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8535916805267334, "num_tokens": 546439633.0, "step": 14328 }, { "epoch": 1.8227960819234195, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.440006256103516, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8653766512870789, "num_tokens": 546477282.0, "step": 14329 }, { "epoch": 1.82292329220201, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.77818298339844, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8731885552406311, "num_tokens": 546514584.0, "step": 14330 }, { "epoch": 1.8230505024806005, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.47811508178711, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8629510402679443, "num_tokens": 546550761.0, "step": 14331 }, { "epoch": 1.823177712759191, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.20272445678711, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8472715020179749, "num_tokens": 546594157.0, "step": 14332 }, { "epoch": 1.8233049230377816, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.74727249145508, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8699300289154053, "num_tokens": 546634671.0, "step": 14333 }, { "epoch": 1.823432133316372, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.814796447753906, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8608860373497009, "num_tokens": 546672084.0, "step": 14334 }, { "epoch": 1.8235593435949626, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.887451171875, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8538639545440674, "num_tokens": 546701920.0, "step": 14335 }, { "epoch": 1.823686553873553, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.049888610839844, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8656255006790161, "num_tokens": 546735776.0, "step": 14336 }, { "epoch": 1.8238137641521435, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.93004608154297, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8668144941329956, "num_tokens": 546772991.0, "step": 14337 }, { "epoch": 1.823940974430734, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.92089080810547, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8555015921592712, "num_tokens": 546805310.0, "step": 14338 }, { "epoch": 1.8240681847093245, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.91695785522461, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8676401376724243, "num_tokens": 546838240.0, "step": 14339 }, { "epoch": 1.824195394987915, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.40922546386719, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8569875955581665, "num_tokens": 546878484.0, "step": 14340 }, { "epoch": 1.8243226052665056, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.35359191894531, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8766850829124451, "num_tokens": 546919747.0, "step": 14341 }, { "epoch": 1.8244498155450959, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.081417083740234, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8623145818710327, "num_tokens": 546962614.0, "step": 14342 }, { "epoch": 1.8245770258236864, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.81741714477539, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8552742600440979, "num_tokens": 547000059.0, "step": 14343 }, { "epoch": 1.824704236102277, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.011844635009766, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8632104396820068, "num_tokens": 547037372.0, "step": 14344 }, { "epoch": 1.8248314463808675, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.62648391723633, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8490279316902161, "num_tokens": 547071840.0, "step": 14345 }, { "epoch": 1.824958656659458, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.0546875, "learning_rate": 1e-06, "loss": 0.6529, "mean_token_accuracy": 0.8360506296157837, "num_tokens": 547108916.0, "step": 14346 }, { "epoch": 1.8250858669380485, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.57096481323242, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8796960115432739, "num_tokens": 547146991.0, "step": 14347 }, { "epoch": 1.825213077216639, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.10896301269531, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8730676174163818, "num_tokens": 547184093.0, "step": 14348 }, { "epoch": 1.8253402874952296, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.42940139770508, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8679711818695068, "num_tokens": 547217570.0, "step": 14349 }, { "epoch": 1.8254674977738201, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.16722106933594, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.860400915145874, "num_tokens": 547259498.0, "step": 14350 }, { "epoch": 1.8255947080524106, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.786617279052734, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8597245216369629, "num_tokens": 547298419.0, "step": 14351 }, { "epoch": 1.8257219183310012, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.898494720458984, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8698078393936157, "num_tokens": 547342984.0, "step": 14352 }, { "epoch": 1.8258491286095917, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.36039352416992, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8632786870002747, "num_tokens": 547383162.0, "step": 14353 }, { "epoch": 1.8259763388881822, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.76923751831055, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8826384544372559, "num_tokens": 547416967.0, "step": 14354 }, { "epoch": 1.8261035491667728, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.04912185668945, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8636595010757446, "num_tokens": 547454695.0, "step": 14355 }, { "epoch": 1.8262307594453633, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.003353118896484, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8715821504592896, "num_tokens": 547494258.0, "step": 14356 }, { "epoch": 1.8263579697239538, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.0832633972168, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.856143057346344, "num_tokens": 547531404.0, "step": 14357 }, { "epoch": 1.8264851800025443, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.84295654296875, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8659771680831909, "num_tokens": 547566553.0, "step": 14358 }, { "epoch": 1.8266123902811349, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.10010528564453, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8541895151138306, "num_tokens": 547605505.0, "step": 14359 }, { "epoch": 1.8267396005597252, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.93102264404297, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8687224388122559, "num_tokens": 547643146.0, "step": 14360 }, { "epoch": 1.8268668108383157, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.386837005615234, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.859452486038208, "num_tokens": 547683373.0, "step": 14361 }, { "epoch": 1.8269940211169062, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.728271484375, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8551687002182007, "num_tokens": 547716762.0, "step": 14362 }, { "epoch": 1.8271212313954968, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.179832458496094, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8681049942970276, "num_tokens": 547756907.0, "step": 14363 }, { "epoch": 1.8272484416740873, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 44.34530258178711, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8572587966918945, "num_tokens": 547793545.0, "step": 14364 }, { "epoch": 1.8273756519526778, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.945919036865234, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8517165780067444, "num_tokens": 547836845.0, "step": 14365 }, { "epoch": 1.8275028622312681, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.18134307861328, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8682427406311035, "num_tokens": 547875849.0, "step": 14366 }, { "epoch": 1.8276300725098586, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.75895690917969, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8543668389320374, "num_tokens": 547918672.0, "step": 14367 }, { "epoch": 1.8277572827884492, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.49541473388672, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.849135160446167, "num_tokens": 547958681.0, "step": 14368 }, { "epoch": 1.8278844930670397, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.887969970703125, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8550286293029785, "num_tokens": 547996940.0, "step": 14369 }, { "epoch": 1.8280117033456302, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.16100311279297, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.870335042476654, "num_tokens": 548030188.0, "step": 14370 }, { "epoch": 1.8281389136242208, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.86251449584961, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8777217268943787, "num_tokens": 548066891.0, "step": 14371 }, { "epoch": 1.8282661239028113, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.642940521240234, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8668591380119324, "num_tokens": 548104664.0, "step": 14372 }, { "epoch": 1.8283933341814018, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.80337142944336, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.857030987739563, "num_tokens": 548143938.0, "step": 14373 }, { "epoch": 1.8285205444599923, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.23026657104492, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8605424165725708, "num_tokens": 548183213.0, "step": 14374 }, { "epoch": 1.8286477547385829, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.4012565612793, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.84602952003479, "num_tokens": 548221304.0, "step": 14375 }, { "epoch": 1.8287749650171734, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.32421875, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8594940900802612, "num_tokens": 548261123.0, "step": 14376 }, { "epoch": 1.828902175295764, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.13388442993164, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.8407491445541382, "num_tokens": 548305040.0, "step": 14377 }, { "epoch": 1.8290293855743545, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.942413330078125, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8689896464347839, "num_tokens": 548338538.0, "step": 14378 }, { "epoch": 1.829156595852945, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.94512939453125, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.864236056804657, "num_tokens": 548378510.0, "step": 14379 }, { "epoch": 1.8292838061315355, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.833763122558594, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8544435501098633, "num_tokens": 548415790.0, "step": 14380 }, { "epoch": 1.829411016410126, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.83211135864258, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8524256944656372, "num_tokens": 548447294.0, "step": 14381 }, { "epoch": 1.8295382266887166, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.342533111572266, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8685156106948853, "num_tokens": 548484617.0, "step": 14382 }, { "epoch": 1.829665436967307, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.134159088134766, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.856701135635376, "num_tokens": 548527552.0, "step": 14383 }, { "epoch": 1.8297926472458976, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.841121673583984, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8801690340042114, "num_tokens": 548561322.0, "step": 14384 }, { "epoch": 1.829919857524488, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.21215057373047, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8729516267776489, "num_tokens": 548594810.0, "step": 14385 }, { "epoch": 1.8300470678030785, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.95648193359375, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8534561395645142, "num_tokens": 548631094.0, "step": 14386 }, { "epoch": 1.830174278081669, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.38067626953125, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8615139722824097, "num_tokens": 548669162.0, "step": 14387 }, { "epoch": 1.8303014883602595, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.912315368652344, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8618214726448059, "num_tokens": 548704555.0, "step": 14388 }, { "epoch": 1.83042869863885, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.4224739074707, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8749419450759888, "num_tokens": 548741852.0, "step": 14389 }, { "epoch": 1.8305559089174406, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.23763656616211, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8480532169342041, "num_tokens": 548788455.0, "step": 14390 }, { "epoch": 1.8306831191960309, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.1735954284668, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8690712451934814, "num_tokens": 548826347.0, "step": 14391 }, { "epoch": 1.8308103294746214, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.038726806640625, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8549824953079224, "num_tokens": 548865428.0, "step": 14392 }, { "epoch": 1.830937539753212, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.303550720214844, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8474512100219727, "num_tokens": 548901671.0, "step": 14393 }, { "epoch": 1.8310647500318025, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.253440856933594, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8577399849891663, "num_tokens": 548939234.0, "step": 14394 }, { "epoch": 1.831191960310393, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.35942077636719, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8649768829345703, "num_tokens": 548975334.0, "step": 14395 }, { "epoch": 1.8313191705889835, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.151790618896484, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8561581969261169, "num_tokens": 549016372.0, "step": 14396 }, { "epoch": 1.831446380867574, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.32719802856445, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.867232084274292, "num_tokens": 549058049.0, "step": 14397 }, { "epoch": 1.8315735911461646, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.128170013427734, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.849684476852417, "num_tokens": 549097855.0, "step": 14398 }, { "epoch": 1.831700801424755, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.01097869873047, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8810751438140869, "num_tokens": 549130906.0, "step": 14399 }, { "epoch": 1.8318280117033456, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.30952453613281, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.8423207402229309, "num_tokens": 549176574.0, "step": 14400 }, { "epoch": 1.8319552219819362, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.09553909301758, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8590186834335327, "num_tokens": 549213945.0, "step": 14401 }, { "epoch": 1.8320824322605267, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.00296401977539, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.852977454662323, "num_tokens": 549255843.0, "step": 14402 }, { "epoch": 1.8322096425391172, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.23741912841797, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8653151392936707, "num_tokens": 549292573.0, "step": 14403 }, { "epoch": 1.8323368528177078, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.21355438232422, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.873924732208252, "num_tokens": 549335291.0, "step": 14404 }, { "epoch": 1.8324640630962983, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.150108337402344, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8488366007804871, "num_tokens": 549381337.0, "step": 14405 }, { "epoch": 1.8325912733748888, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.956111907958984, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8724238872528076, "num_tokens": 549418180.0, "step": 14406 }, { "epoch": 1.8327184836534793, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.091373443603516, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8705117702484131, "num_tokens": 549461623.0, "step": 14407 }, { "epoch": 1.8328456939320699, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.916839599609375, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.863620400428772, "num_tokens": 549500791.0, "step": 14408 }, { "epoch": 1.8329729042106602, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.189517974853516, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8669629096984863, "num_tokens": 549533343.0, "step": 14409 }, { "epoch": 1.8331001144892507, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.93278121948242, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8516349792480469, "num_tokens": 549572707.0, "step": 14410 }, { "epoch": 1.8332273247678412, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.51470947265625, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.868257999420166, "num_tokens": 549609772.0, "step": 14411 }, { "epoch": 1.8333545350464318, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.8985481262207, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8504863977432251, "num_tokens": 549648626.0, "step": 14412 }, { "epoch": 1.8334817453250223, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.58977127075195, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8501631617546082, "num_tokens": 549693211.0, "step": 14413 }, { "epoch": 1.8336089556036128, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.305503845214844, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8557795286178589, "num_tokens": 549729463.0, "step": 14414 }, { "epoch": 1.8337361658822031, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.081417083740234, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8668847680091858, "num_tokens": 549769139.0, "step": 14415 }, { "epoch": 1.8338633761607936, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.969337463378906, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8651214241981506, "num_tokens": 549807067.0, "step": 14416 }, { "epoch": 1.8339905864393842, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.10206604003906, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8607948422431946, "num_tokens": 549842689.0, "step": 14417 }, { "epoch": 1.8341177967179747, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.53974914550781, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8807830810546875, "num_tokens": 549876108.0, "step": 14418 }, { "epoch": 1.8342450069965652, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.065799713134766, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8757921457290649, "num_tokens": 549909656.0, "step": 14419 }, { "epoch": 1.8343722172751558, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.56367874145508, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8536918759346008, "num_tokens": 549949156.0, "step": 14420 }, { "epoch": 1.8344994275537463, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.85159683227539, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8684024214744568, "num_tokens": 549990256.0, "step": 14421 }, { "epoch": 1.8346266378323368, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.42519760131836, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8693153262138367, "num_tokens": 550023557.0, "step": 14422 }, { "epoch": 1.8347538481109273, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.8668098449707, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8572397232055664, "num_tokens": 550069402.0, "step": 14423 }, { "epoch": 1.8348810583895179, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.92277908325195, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8692988157272339, "num_tokens": 550110860.0, "step": 14424 }, { "epoch": 1.8350082686681084, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.13346481323242, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8468437194824219, "num_tokens": 550151736.0, "step": 14425 }, { "epoch": 1.835135478946699, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.84125900268555, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8634300231933594, "num_tokens": 550187731.0, "step": 14426 }, { "epoch": 1.8352626892252895, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.354896545410156, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.863855242729187, "num_tokens": 550220545.0, "step": 14427 }, { "epoch": 1.83538989950388, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.035797119140625, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8653284311294556, "num_tokens": 550254755.0, "step": 14428 }, { "epoch": 1.8355171097824705, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.02358627319336, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.869010865688324, "num_tokens": 550295191.0, "step": 14429 }, { "epoch": 1.835644320061061, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.32297134399414, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.854644775390625, "num_tokens": 550332984.0, "step": 14430 }, { "epoch": 1.8357715303396516, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.996212005615234, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8652387857437134, "num_tokens": 550377470.0, "step": 14431 }, { "epoch": 1.835898740618242, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.186702728271484, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8659580945968628, "num_tokens": 550416123.0, "step": 14432 }, { "epoch": 1.8360259508968326, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.266387939453125, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8663420677185059, "num_tokens": 550458062.0, "step": 14433 }, { "epoch": 1.836153161175423, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.93851089477539, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8553110361099243, "num_tokens": 550496485.0, "step": 14434 }, { "epoch": 1.8362803714540135, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.880157470703125, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8639497756958008, "num_tokens": 550536500.0, "step": 14435 }, { "epoch": 1.836407581732604, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.691978454589844, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8567113876342773, "num_tokens": 550575611.0, "step": 14436 }, { "epoch": 1.8365347920111945, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.71408462524414, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8532761335372925, "num_tokens": 550616431.0, "step": 14437 }, { "epoch": 1.836662002289785, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.68391799926758, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8709416389465332, "num_tokens": 550653626.0, "step": 14438 }, { "epoch": 1.8367892125683756, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.013179779052734, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8580953478813171, "num_tokens": 550690899.0, "step": 14439 }, { "epoch": 1.8369164228469659, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.584625244140625, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8619778156280518, "num_tokens": 550734408.0, "step": 14440 }, { "epoch": 1.8370436331255564, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.96053695678711, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8503512740135193, "num_tokens": 550767888.0, "step": 14441 }, { "epoch": 1.837170843404147, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.04800033569336, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8623507022857666, "num_tokens": 550807192.0, "step": 14442 }, { "epoch": 1.8372980536827375, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.73587417602539, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8705062866210938, "num_tokens": 550848006.0, "step": 14443 }, { "epoch": 1.837425263961328, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.085533142089844, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8624801635742188, "num_tokens": 550888350.0, "step": 14444 }, { "epoch": 1.8375524742399185, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.93971252441406, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8675525188446045, "num_tokens": 550925389.0, "step": 14445 }, { "epoch": 1.837679684518509, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.839290618896484, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8673693537712097, "num_tokens": 550962610.0, "step": 14446 }, { "epoch": 1.8378068947970996, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.93586730957031, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8598352670669556, "num_tokens": 551001018.0, "step": 14447 }, { "epoch": 1.83793410507569, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.980403900146484, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8624931573867798, "num_tokens": 551038505.0, "step": 14448 }, { "epoch": 1.8380613153542806, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.38523483276367, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8741447925567627, "num_tokens": 551072235.0, "step": 14449 }, { "epoch": 1.8381885256328712, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.192867279052734, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8742482662200928, "num_tokens": 551110129.0, "step": 14450 }, { "epoch": 1.8383157359114617, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.576881408691406, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8622385263442993, "num_tokens": 551152380.0, "step": 14451 }, { "epoch": 1.8384429461900522, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.28667068481445, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8733159303665161, "num_tokens": 551195942.0, "step": 14452 }, { "epoch": 1.8385701564686427, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.54643249511719, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8551027774810791, "num_tokens": 551238502.0, "step": 14453 }, { "epoch": 1.8386973667472333, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.232086181640625, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8713794350624084, "num_tokens": 551278091.0, "step": 14454 }, { "epoch": 1.8388245770258238, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.80611038208008, "learning_rate": 1e-06, "loss": 0.6327, "mean_token_accuracy": 0.8421186208724976, "num_tokens": 551314940.0, "step": 14455 }, { "epoch": 1.8389517873044143, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.248321533203125, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8483126759529114, "num_tokens": 551356427.0, "step": 14456 }, { "epoch": 1.8390789975830049, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.02325439453125, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8663166761398315, "num_tokens": 551400436.0, "step": 14457 }, { "epoch": 1.8392062078615952, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.736610412597656, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.85938560962677, "num_tokens": 551438335.0, "step": 14458 }, { "epoch": 1.8393334181401857, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.321319580078125, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8632596135139465, "num_tokens": 551471214.0, "step": 14459 }, { "epoch": 1.8394606284187762, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.53254699707031, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8485829830169678, "num_tokens": 551504071.0, "step": 14460 }, { "epoch": 1.8395878386973668, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.26556396484375, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8564268350601196, "num_tokens": 551538456.0, "step": 14461 }, { "epoch": 1.8397150489759573, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.926334381103516, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.866725504398346, "num_tokens": 551575334.0, "step": 14462 }, { "epoch": 1.8398422592545478, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.90346145629883, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8707189559936523, "num_tokens": 551617127.0, "step": 14463 }, { "epoch": 1.8399694695331381, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.571998596191406, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8674089908599854, "num_tokens": 551655548.0, "step": 14464 }, { "epoch": 1.8400966798117286, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.644508361816406, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8596938252449036, "num_tokens": 551691807.0, "step": 14465 }, { "epoch": 1.8402238900903192, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.532352447509766, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8694806694984436, "num_tokens": 551729805.0, "step": 14466 }, { "epoch": 1.8403511003689097, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.34852981567383, "learning_rate": 1e-06, "loss": 0.6492, "mean_token_accuracy": 0.8390510082244873, "num_tokens": 551775220.0, "step": 14467 }, { "epoch": 1.8404783106475002, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.72990417480469, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8534197211265564, "num_tokens": 551816242.0, "step": 14468 }, { "epoch": 1.8406055209260908, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 42.998512268066406, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8496918678283691, "num_tokens": 551854929.0, "step": 14469 }, { "epoch": 1.8407327312046813, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.946983337402344, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8609335422515869, "num_tokens": 551898877.0, "step": 14470 }, { "epoch": 1.8408599414832718, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.45999526977539, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8772948980331421, "num_tokens": 551934252.0, "step": 14471 }, { "epoch": 1.8409871517618623, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.88323974609375, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.873026430606842, "num_tokens": 551974779.0, "step": 14472 }, { "epoch": 1.8411143620404529, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.44086456298828, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8617741465568542, "num_tokens": 552011627.0, "step": 14473 }, { "epoch": 1.8412415723190434, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.217594146728516, "learning_rate": 1e-06, "loss": 0.6422, "mean_token_accuracy": 0.8405959606170654, "num_tokens": 552045333.0, "step": 14474 }, { "epoch": 1.841368782597634, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.11894226074219, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8552727699279785, "num_tokens": 552079974.0, "step": 14475 }, { "epoch": 1.8414959928762245, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.9265022277832, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8677982091903687, "num_tokens": 552117507.0, "step": 14476 }, { "epoch": 1.841623203154815, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.39849090576172, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8606610298156738, "num_tokens": 552156009.0, "step": 14477 }, { "epoch": 1.8417504134334055, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.25654602050781, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8702908754348755, "num_tokens": 552189092.0, "step": 14478 }, { "epoch": 1.841877623711996, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.19694137573242, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8593788743019104, "num_tokens": 552228305.0, "step": 14479 }, { "epoch": 1.8420048339905866, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.73593521118164, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8634896874427795, "num_tokens": 552269402.0, "step": 14480 }, { "epoch": 1.842132044269177, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.86151123046875, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.85921311378479, "num_tokens": 552304870.0, "step": 14481 }, { "epoch": 1.8422592545477676, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.79838562011719, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8565805554389954, "num_tokens": 552336802.0, "step": 14482 }, { "epoch": 1.842386464826358, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.06260681152344, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8752448558807373, "num_tokens": 552376124.0, "step": 14483 }, { "epoch": 1.8425136751049485, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.925594329833984, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.852934718132019, "num_tokens": 552412285.0, "step": 14484 }, { "epoch": 1.842640885383539, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.24137878417969, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8613168001174927, "num_tokens": 552447527.0, "step": 14485 }, { "epoch": 1.8427680956621295, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.33176040649414, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8652298450469971, "num_tokens": 552486569.0, "step": 14486 }, { "epoch": 1.84289530594072, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.34406280517578, "learning_rate": 1e-06, "loss": 0.7042, "mean_token_accuracy": 0.827236533164978, "num_tokens": 552529158.0, "step": 14487 }, { "epoch": 1.8430225162193106, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.06084442138672, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8599976301193237, "num_tokens": 552572183.0, "step": 14488 }, { "epoch": 1.8431497264979009, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.0542106628418, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8591678142547607, "num_tokens": 552609785.0, "step": 14489 }, { "epoch": 1.8432769367764914, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.957427978515625, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8678251504898071, "num_tokens": 552641243.0, "step": 14490 }, { "epoch": 1.843404147055082, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.1322021484375, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8545846939086914, "num_tokens": 552677595.0, "step": 14491 }, { "epoch": 1.8435313573336725, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.168670654296875, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8619686365127563, "num_tokens": 552715492.0, "step": 14492 }, { "epoch": 1.843658567612263, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.24860763549805, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8619916439056396, "num_tokens": 552757410.0, "step": 14493 }, { "epoch": 1.8437857778908535, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.30763244628906, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8768895268440247, "num_tokens": 552793794.0, "step": 14494 }, { "epoch": 1.843912988169444, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.128814697265625, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.846821665763855, "num_tokens": 552836010.0, "step": 14495 }, { "epoch": 1.8440401984480346, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.56209182739258, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8638187646865845, "num_tokens": 552874136.0, "step": 14496 }, { "epoch": 1.844167408726625, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.66944885253906, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8581173419952393, "num_tokens": 552912314.0, "step": 14497 }, { "epoch": 1.8442946190052156, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.68635940551758, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.8503116369247437, "num_tokens": 552953998.0, "step": 14498 }, { "epoch": 1.8444218292838062, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.811798095703125, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8815963864326477, "num_tokens": 552987310.0, "step": 14499 }, { "epoch": 1.8445490395623967, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.743408203125, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8585271835327148, "num_tokens": 553019830.0, "step": 14500 }, { "epoch": 1.8446762498409872, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.857425689697266, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8619716167449951, "num_tokens": 553066695.0, "step": 14501 }, { "epoch": 1.8448034601195777, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.948585510253906, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8519790768623352, "num_tokens": 553098943.0, "step": 14502 }, { "epoch": 1.8449306703981683, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.86088943481445, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8661301732063293, "num_tokens": 553136106.0, "step": 14503 }, { "epoch": 1.8450578806767588, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.38361740112305, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8680666089057922, "num_tokens": 553170014.0, "step": 14504 }, { "epoch": 1.8451850909553493, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.95367431640625, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8557883501052856, "num_tokens": 553213297.0, "step": 14505 }, { "epoch": 1.8453123012339399, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.00855255126953, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8623071908950806, "num_tokens": 553248559.0, "step": 14506 }, { "epoch": 1.8454395115125302, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.51313018798828, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8636553287506104, "num_tokens": 553289140.0, "step": 14507 }, { "epoch": 1.8455667217911207, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.83955764770508, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8548314571380615, "num_tokens": 553330619.0, "step": 14508 }, { "epoch": 1.8456939320697112, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.28919219970703, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.871940016746521, "num_tokens": 553371295.0, "step": 14509 }, { "epoch": 1.8458211423483017, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.61772537231445, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8584350347518921, "num_tokens": 553410237.0, "step": 14510 }, { "epoch": 1.8459483526268923, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.47834014892578, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8687688112258911, "num_tokens": 553446727.0, "step": 14511 }, { "epoch": 1.8460755629054828, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.7296142578125, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.866951584815979, "num_tokens": 553488626.0, "step": 14512 }, { "epoch": 1.846202773184073, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.47560501098633, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8535493612289429, "num_tokens": 553528294.0, "step": 14513 }, { "epoch": 1.8463299834626636, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.90010070800781, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8659499287605286, "num_tokens": 553563738.0, "step": 14514 }, { "epoch": 1.8464571937412542, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.50871658325195, "learning_rate": 1e-06, "loss": 0.6838, "mean_token_accuracy": 0.8274484872817993, "num_tokens": 553603351.0, "step": 14515 }, { "epoch": 1.8465844040198447, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.999420166015625, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8596266508102417, "num_tokens": 553643655.0, "step": 14516 }, { "epoch": 1.8467116142984352, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.43277359008789, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8742969632148743, "num_tokens": 553677944.0, "step": 14517 }, { "epoch": 1.8468388245770258, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.242679595947266, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8577269315719604, "num_tokens": 553712120.0, "step": 14518 }, { "epoch": 1.8469660348556163, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.85020446777344, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8664926290512085, "num_tokens": 553751442.0, "step": 14519 }, { "epoch": 1.8470932451342068, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.92119598388672, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.868655264377594, "num_tokens": 553782924.0, "step": 14520 }, { "epoch": 1.8472204554127973, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.82630157470703, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8531518578529358, "num_tokens": 553818861.0, "step": 14521 }, { "epoch": 1.8473476656913879, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.349517822265625, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8595905900001526, "num_tokens": 553855593.0, "step": 14522 }, { "epoch": 1.8474748759699784, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.154541015625, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.859798789024353, "num_tokens": 553896028.0, "step": 14523 }, { "epoch": 1.847602086248569, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.43412780761719, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8662275075912476, "num_tokens": 553935212.0, "step": 14524 }, { "epoch": 1.8477292965271594, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.79372024536133, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8595185875892639, "num_tokens": 553971231.0, "step": 14525 }, { "epoch": 1.84785650680575, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.62107849121094, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8699808120727539, "num_tokens": 554006782.0, "step": 14526 }, { "epoch": 1.8479837170843405, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.1273193359375, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8552823066711426, "num_tokens": 554043427.0, "step": 14527 }, { "epoch": 1.848110927362931, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.045352935791016, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8512842655181885, "num_tokens": 554084187.0, "step": 14528 }, { "epoch": 1.8482381376415216, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.88996124267578, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8601917028427124, "num_tokens": 554122186.0, "step": 14529 }, { "epoch": 1.848365347920112, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.70115661621094, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8603091239929199, "num_tokens": 554161388.0, "step": 14530 }, { "epoch": 1.8484925581987026, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.92637252807617, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.851956844329834, "num_tokens": 554199837.0, "step": 14531 }, { "epoch": 1.848619768477293, "ewc_loss": 0.154296875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013256072998046875, "grad_norm": 43.61767578125, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8521347641944885, "num_tokens": 554239567.0, "step": 14532 }, { "epoch": 1.8487469787558835, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.690093994140625, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8640134334564209, "num_tokens": 554277040.0, "step": 14533 }, { "epoch": 1.848874189034474, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.853946685791016, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.865919828414917, "num_tokens": 554318994.0, "step": 14534 }, { "epoch": 1.8490013993130645, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.83076858520508, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8561985492706299, "num_tokens": 554355334.0, "step": 14535 }, { "epoch": 1.849128609591655, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.93656539916992, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8499740362167358, "num_tokens": 554396504.0, "step": 14536 }, { "epoch": 1.8492558198702456, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.51949691772461, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8674010634422302, "num_tokens": 554429772.0, "step": 14537 }, { "epoch": 1.8493830301488359, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.49699401855469, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8628821969032288, "num_tokens": 554470102.0, "step": 14538 }, { "epoch": 1.8495102404274264, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.34745788574219, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.853244960308075, "num_tokens": 554511632.0, "step": 14539 }, { "epoch": 1.849637450706017, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.39402770996094, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8580147624015808, "num_tokens": 554553028.0, "step": 14540 }, { "epoch": 1.8497646609846075, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.24654006958008, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8572706580162048, "num_tokens": 554585109.0, "step": 14541 }, { "epoch": 1.849891871263198, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.61305618286133, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8607826232910156, "num_tokens": 554621141.0, "step": 14542 }, { "epoch": 1.8500190815417885, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.008888244628906, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8605921268463135, "num_tokens": 554660838.0, "step": 14543 }, { "epoch": 1.850146291820379, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.60737228393555, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8634105920791626, "num_tokens": 554696338.0, "step": 14544 }, { "epoch": 1.8502735020989696, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.27511978149414, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8735024929046631, "num_tokens": 554729782.0, "step": 14545 }, { "epoch": 1.85040071237756, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.26390838623047, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8774099349975586, "num_tokens": 554767566.0, "step": 14546 }, { "epoch": 1.8505279226561506, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.526519775390625, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8656353950500488, "num_tokens": 554806063.0, "step": 14547 }, { "epoch": 1.8506551329347412, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.506019592285156, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8589366674423218, "num_tokens": 554839418.0, "step": 14548 }, { "epoch": 1.8507823432133317, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.36370086669922, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8582392930984497, "num_tokens": 554881888.0, "step": 14549 }, { "epoch": 1.8509095534919222, "ewc_loss": 0.1533203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001316070556640625, "grad_norm": 43.700042724609375, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8800557255744934, "num_tokens": 554921561.0, "step": 14550 }, { "epoch": 1.8510367637705127, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.261783599853516, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8376544713973999, "num_tokens": 554954380.0, "step": 14551 }, { "epoch": 1.8511639740491033, "ewc_loss": 0.1552734375, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.75442123413086, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8635485768318176, "num_tokens": 554994781.0, "step": 14552 }, { "epoch": 1.8512911843276938, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.230796813964844, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8683323860168457, "num_tokens": 555024938.0, "step": 14553 }, { "epoch": 1.8514183946062843, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.0227165222168, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.850602388381958, "num_tokens": 555059785.0, "step": 14554 }, { "epoch": 1.8515456048848749, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.59994125366211, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8363551497459412, "num_tokens": 555097502.0, "step": 14555 }, { "epoch": 1.8516728151634652, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.170345306396484, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8577358722686768, "num_tokens": 555139976.0, "step": 14556 }, { "epoch": 1.8518000254420557, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.5937614440918, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8653831481933594, "num_tokens": 555181250.0, "step": 14557 }, { "epoch": 1.8519272357206462, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.04338836669922, "learning_rate": 1e-06, "loss": 0.6353, "mean_token_accuracy": 0.841374397277832, "num_tokens": 555214927.0, "step": 14558 }, { "epoch": 1.8520544459992367, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.794883728027344, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8575863838195801, "num_tokens": 555255795.0, "step": 14559 }, { "epoch": 1.8521816562778273, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.146812438964844, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8641164302825928, "num_tokens": 555294177.0, "step": 14560 }, { "epoch": 1.8523088665564178, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.67403793334961, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8748241066932678, "num_tokens": 555331548.0, "step": 14561 }, { "epoch": 1.852436076835008, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.598602294921875, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8732224702835083, "num_tokens": 555368426.0, "step": 14562 }, { "epoch": 1.8525632871135986, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.49507522583008, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8767445087432861, "num_tokens": 555411594.0, "step": 14563 }, { "epoch": 1.8526904973921892, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.37586212158203, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8543335199356079, "num_tokens": 555450412.0, "step": 14564 }, { "epoch": 1.8528177076707797, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.39683151245117, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8711293935775757, "num_tokens": 555489067.0, "step": 14565 }, { "epoch": 1.8529449179493702, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.16145706176758, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8666353225708008, "num_tokens": 555524552.0, "step": 14566 }, { "epoch": 1.8530721282279607, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.88557815551758, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8344742655754089, "num_tokens": 555563766.0, "step": 14567 }, { "epoch": 1.8531993385065513, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.178558349609375, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8691828846931458, "num_tokens": 555601410.0, "step": 14568 }, { "epoch": 1.8533265487851418, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.24641799926758, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8592355251312256, "num_tokens": 555641346.0, "step": 14569 }, { "epoch": 1.8534537590637323, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.161685943603516, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8703540563583374, "num_tokens": 555676537.0, "step": 14570 }, { "epoch": 1.8535809693423229, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.46012878417969, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.847052812576294, "num_tokens": 555721634.0, "step": 14571 }, { "epoch": 1.8537081796209134, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.91434860229492, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8621549606323242, "num_tokens": 555765447.0, "step": 14572 }, { "epoch": 1.853835389899504, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.61947250366211, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8607851266860962, "num_tokens": 555798598.0, "step": 14573 }, { "epoch": 1.8539626001780944, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.38228988647461, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8687458634376526, "num_tokens": 555829465.0, "step": 14574 }, { "epoch": 1.854089810456685, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.47629165649414, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8633626699447632, "num_tokens": 555866577.0, "step": 14575 }, { "epoch": 1.8542170207352755, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.546470642089844, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8633574843406677, "num_tokens": 555900472.0, "step": 14576 }, { "epoch": 1.854344231013866, "ewc_loss": 0.15625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.298458099365234, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8543925285339355, "num_tokens": 555936356.0, "step": 14577 }, { "epoch": 1.8544714412924566, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.66217041015625, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8543685674667358, "num_tokens": 555975338.0, "step": 14578 }, { "epoch": 1.854598651571047, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.7183837890625, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8520311117172241, "num_tokens": 556019651.0, "step": 14579 }, { "epoch": 1.8547258618496376, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.26580810546875, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8557549715042114, "num_tokens": 556054530.0, "step": 14580 }, { "epoch": 1.854853072128228, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.18846130371094, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.856825590133667, "num_tokens": 556091985.0, "step": 14581 }, { "epoch": 1.8549802824068184, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.16823959350586, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.865235447883606, "num_tokens": 556128777.0, "step": 14582 }, { "epoch": 1.855107492685409, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.941741943359375, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8794853687286377, "num_tokens": 556164286.0, "step": 14583 }, { "epoch": 1.8552347029639995, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.40641403198242, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8691140413284302, "num_tokens": 556199141.0, "step": 14584 }, { "epoch": 1.85536191324259, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 45.03209686279297, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8669743537902832, "num_tokens": 556237104.0, "step": 14585 }, { "epoch": 1.8554891235211806, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.5900993347168, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8515529632568359, "num_tokens": 556275343.0, "step": 14586 }, { "epoch": 1.8556163337997709, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.66654968261719, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.878052830696106, "num_tokens": 556316539.0, "step": 14587 }, { "epoch": 1.8557435440783614, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.75300216674805, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.872124969959259, "num_tokens": 556351101.0, "step": 14588 }, { "epoch": 1.855870754356952, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.95705032348633, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8725706338882446, "num_tokens": 556382458.0, "step": 14589 }, { "epoch": 1.8559979646355425, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.316959381103516, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8645156621932983, "num_tokens": 556416980.0, "step": 14590 }, { "epoch": 1.856125174914133, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 45.118133544921875, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8669740557670593, "num_tokens": 556451868.0, "step": 14591 }, { "epoch": 1.8562523851927235, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.11536407470703, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8500474095344543, "num_tokens": 556485682.0, "step": 14592 }, { "epoch": 1.856379595471314, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.349185943603516, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8738618493080139, "num_tokens": 556524857.0, "step": 14593 }, { "epoch": 1.8565068057499046, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.35356521606445, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8669768571853638, "num_tokens": 556565763.0, "step": 14594 }, { "epoch": 1.856634016028495, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.08237838745117, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8508126735687256, "num_tokens": 556606202.0, "step": 14595 }, { "epoch": 1.8567612263070856, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.48292922973633, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8716544508934021, "num_tokens": 556646906.0, "step": 14596 }, { "epoch": 1.8568884365856761, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.74847412109375, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.863802433013916, "num_tokens": 556684249.0, "step": 14597 }, { "epoch": 1.8570156468642667, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.398834228515625, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8509604930877686, "num_tokens": 556726840.0, "step": 14598 }, { "epoch": 1.8571428571428572, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 45.0872917175293, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8631070852279663, "num_tokens": 556764822.0, "step": 14599 }, { "epoch": 1.8572700674214477, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.31447982788086, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8597984313964844, "num_tokens": 556800884.0, "step": 14600 }, { "epoch": 1.8573972777000383, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.72134780883789, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8681378960609436, "num_tokens": 556841233.0, "step": 14601 }, { "epoch": 1.8575244879786288, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.173866271972656, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8558992743492126, "num_tokens": 556880995.0, "step": 14602 }, { "epoch": 1.8576516982572193, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.581787109375, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8652408719062805, "num_tokens": 556914367.0, "step": 14603 }, { "epoch": 1.8577789085358098, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.814449310302734, "learning_rate": 1e-06, "loss": 0.6361, "mean_token_accuracy": 0.8437141180038452, "num_tokens": 556955944.0, "step": 14604 }, { "epoch": 1.8579061188144002, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.3857421875, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8453957438468933, "num_tokens": 556989851.0, "step": 14605 }, { "epoch": 1.8580333290929907, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.134185791015625, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8608096837997437, "num_tokens": 557033662.0, "step": 14606 }, { "epoch": 1.8581605393715812, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.15188217163086, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8550046682357788, "num_tokens": 557074869.0, "step": 14607 }, { "epoch": 1.8582877496501717, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.85917663574219, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8500824570655823, "num_tokens": 557110164.0, "step": 14608 }, { "epoch": 1.8584149599287623, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.51362991333008, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8607007265090942, "num_tokens": 557149309.0, "step": 14609 }, { "epoch": 1.8585421702073528, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.83272933959961, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8577066659927368, "num_tokens": 557180510.0, "step": 14610 }, { "epoch": 1.858669380485943, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.15686798095703, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8670013546943665, "num_tokens": 557218062.0, "step": 14611 }, { "epoch": 1.8587965907645336, "ewc_loss": 0.1572265625, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.31229782104492, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8537302017211914, "num_tokens": 557258963.0, "step": 14612 }, { "epoch": 1.8589238010431242, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.58003234863281, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8624536991119385, "num_tokens": 557296046.0, "step": 14613 }, { "epoch": 1.8590510113217147, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.706085205078125, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.859545111656189, "num_tokens": 557328395.0, "step": 14614 }, { "epoch": 1.8591782216003052, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.35091018676758, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8649221658706665, "num_tokens": 557363888.0, "step": 14615 }, { "epoch": 1.8593054318788957, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.90818786621094, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.854070246219635, "num_tokens": 557402944.0, "step": 14616 }, { "epoch": 1.8594326421574863, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.8514404296875, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8676339387893677, "num_tokens": 557442247.0, "step": 14617 }, { "epoch": 1.8595598524360768, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.27385711669922, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8546707034111023, "num_tokens": 557483580.0, "step": 14618 }, { "epoch": 1.8596870627146673, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.67496109008789, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8660534024238586, "num_tokens": 557520134.0, "step": 14619 }, { "epoch": 1.8598142729932579, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.201847076416016, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.856248140335083, "num_tokens": 557565726.0, "step": 14620 }, { "epoch": 1.8599414832718484, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.988037109375, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8586507439613342, "num_tokens": 557602871.0, "step": 14621 }, { "epoch": 1.860068693550439, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.2181510925293, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8542261719703674, "num_tokens": 557640873.0, "step": 14622 }, { "epoch": 1.8601959038290294, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.99030303955078, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8542301654815674, "num_tokens": 557684233.0, "step": 14623 }, { "epoch": 1.86032311410762, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.13434982299805, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8673273324966431, "num_tokens": 557716895.0, "step": 14624 }, { "epoch": 1.8604503243862105, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.43497848510742, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8577107787132263, "num_tokens": 557758197.0, "step": 14625 }, { "epoch": 1.860577534664801, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.266300201416016, "learning_rate": 1e-06, "loss": 0.6672, "mean_token_accuracy": 0.8329105973243713, "num_tokens": 557794196.0, "step": 14626 }, { "epoch": 1.8607047449433916, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.62593078613281, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8444135189056396, "num_tokens": 557833856.0, "step": 14627 }, { "epoch": 1.860831955221982, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.46677780151367, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.848677933216095, "num_tokens": 557866304.0, "step": 14628 }, { "epoch": 1.8609591655005726, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.91276550292969, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8734304904937744, "num_tokens": 557907065.0, "step": 14629 }, { "epoch": 1.861086375779163, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.30561065673828, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8565067052841187, "num_tokens": 557947326.0, "step": 14630 }, { "epoch": 1.8612135860577534, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.84821701049805, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8599293828010559, "num_tokens": 557990059.0, "step": 14631 }, { "epoch": 1.861340796336344, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.57891845703125, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8631580471992493, "num_tokens": 558028406.0, "step": 14632 }, { "epoch": 1.8614680066149345, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.95624923706055, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.856635570526123, "num_tokens": 558070239.0, "step": 14633 }, { "epoch": 1.861595216893525, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.41009521484375, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8651092052459717, "num_tokens": 558107685.0, "step": 14634 }, { "epoch": 1.8617224271721156, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.253150939941406, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8541223406791687, "num_tokens": 558143185.0, "step": 14635 }, { "epoch": 1.8618496374507059, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.36665344238281, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8711791038513184, "num_tokens": 558176207.0, "step": 14636 }, { "epoch": 1.8619768477292964, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.572021484375, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8769010901451111, "num_tokens": 558212215.0, "step": 14637 }, { "epoch": 1.862104058007887, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.581207275390625, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8695154190063477, "num_tokens": 558248471.0, "step": 14638 }, { "epoch": 1.8622312682864774, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.73075866699219, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8479045033454895, "num_tokens": 558285211.0, "step": 14639 }, { "epoch": 1.862358478565068, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.620262145996094, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8546654582023621, "num_tokens": 558314434.0, "step": 14640 }, { "epoch": 1.8624856888436585, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.91056442260742, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.86942058801651, "num_tokens": 558353710.0, "step": 14641 }, { "epoch": 1.862612899122249, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.6168327331543, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.848930835723877, "num_tokens": 558390476.0, "step": 14642 }, { "epoch": 1.8627401094008396, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.52648162841797, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.86986243724823, "num_tokens": 558424917.0, "step": 14643 }, { "epoch": 1.86286731967943, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.088958740234375, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.8532708287239075, "num_tokens": 558459831.0, "step": 14644 }, { "epoch": 1.8629945299580206, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.567718505859375, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8765437006950378, "num_tokens": 558498600.0, "step": 14645 }, { "epoch": 1.8631217402366111, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.2892951965332, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8516978025436401, "num_tokens": 558537465.0, "step": 14646 }, { "epoch": 1.8632489505152017, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.21167755126953, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8803642988204956, "num_tokens": 558575357.0, "step": 14647 }, { "epoch": 1.8633761607937922, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.767616271972656, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8579845428466797, "num_tokens": 558619422.0, "step": 14648 }, { "epoch": 1.8635033710723827, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 45.02946090698242, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8528308272361755, "num_tokens": 558657677.0, "step": 14649 }, { "epoch": 1.8636305813509733, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.48322296142578, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8688482642173767, "num_tokens": 558692912.0, "step": 14650 }, { "epoch": 1.8637577916295638, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 45.191070556640625, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8733630180358887, "num_tokens": 558733470.0, "step": 14651 }, { "epoch": 1.8638850019081543, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 43.904476165771484, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8563080430030823, "num_tokens": 558771899.0, "step": 14652 }, { "epoch": 1.8640122121867448, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.188907623291016, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.86695396900177, "num_tokens": 558813002.0, "step": 14653 }, { "epoch": 1.8641394224653351, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 43.90202331542969, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8460676670074463, "num_tokens": 558850827.0, "step": 14654 }, { "epoch": 1.8642666327439257, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.019283294677734, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8584164381027222, "num_tokens": 558889003.0, "step": 14655 }, { "epoch": 1.8643938430225162, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.09830093383789, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8665103912353516, "num_tokens": 558925849.0, "step": 14656 }, { "epoch": 1.8645210533011067, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.524166107177734, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8606633543968201, "num_tokens": 558967471.0, "step": 14657 }, { "epoch": 1.8646482635796973, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.871482849121094, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8648071885108948, "num_tokens": 559001938.0, "step": 14658 }, { "epoch": 1.8647754738582878, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.92597961425781, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8675408363342285, "num_tokens": 559040515.0, "step": 14659 }, { "epoch": 1.864902684136878, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.04975891113281, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.875733494758606, "num_tokens": 559077702.0, "step": 14660 }, { "epoch": 1.8650298944154686, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.726444244384766, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8536115288734436, "num_tokens": 559114741.0, "step": 14661 }, { "epoch": 1.8651571046940592, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.277462005615234, "learning_rate": 1e-06, "loss": 0.6486, "mean_token_accuracy": 0.8389720916748047, "num_tokens": 559155844.0, "step": 14662 }, { "epoch": 1.8652843149726497, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.60932540893555, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8638492822647095, "num_tokens": 559194964.0, "step": 14663 }, { "epoch": 1.8654115252512402, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.321144104003906, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8674274682998657, "num_tokens": 559230083.0, "step": 14664 }, { "epoch": 1.8655387355298307, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.88090133666992, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8722702860832214, "num_tokens": 559268397.0, "step": 14665 }, { "epoch": 1.8656659458084213, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.91121292114258, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8533507585525513, "num_tokens": 559302024.0, "step": 14666 }, { "epoch": 1.8657931560870118, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.85975646972656, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8597086071968079, "num_tokens": 559343192.0, "step": 14667 }, { "epoch": 1.8659203663656023, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.14860153198242, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8829094767570496, "num_tokens": 559380616.0, "step": 14668 }, { "epoch": 1.8660475766441929, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.0654296875, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8714805245399475, "num_tokens": 559413621.0, "step": 14669 }, { "epoch": 1.8661747869227834, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.947166442871094, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.869349479675293, "num_tokens": 559447274.0, "step": 14670 }, { "epoch": 1.866301997201374, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.07189178466797, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8712804317474365, "num_tokens": 559484306.0, "step": 14671 }, { "epoch": 1.8664292074799644, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.10245895385742, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8685940504074097, "num_tokens": 559527126.0, "step": 14672 }, { "epoch": 1.866556417758555, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.926334381103516, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8662770390510559, "num_tokens": 559568342.0, "step": 14673 }, { "epoch": 1.8666836280371455, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.97835159301758, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8578780889511108, "num_tokens": 559609597.0, "step": 14674 }, { "epoch": 1.866810838315736, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.341102600097656, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8607259392738342, "num_tokens": 559646392.0, "step": 14675 }, { "epoch": 1.8669380485943265, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.03066635131836, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8649589419364929, "num_tokens": 559684836.0, "step": 14676 }, { "epoch": 1.867065258872917, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.28765869140625, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8659244179725647, "num_tokens": 559727793.0, "step": 14677 }, { "epoch": 1.8671924691515076, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 43.79533767700195, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.850108802318573, "num_tokens": 559768154.0, "step": 14678 }, { "epoch": 1.867319679430098, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.11984634399414, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.860762357711792, "num_tokens": 559798732.0, "step": 14679 }, { "epoch": 1.8674468897086884, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.904964447021484, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8615024089813232, "num_tokens": 559842796.0, "step": 14680 }, { "epoch": 1.867574099987279, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.90902328491211, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8583447337150574, "num_tokens": 559884198.0, "step": 14681 }, { "epoch": 1.8677013102658695, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.560184478759766, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8476641774177551, "num_tokens": 559916749.0, "step": 14682 }, { "epoch": 1.86782852054446, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.544883728027344, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.857506275177002, "num_tokens": 559958430.0, "step": 14683 }, { "epoch": 1.8679557308230506, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.47715377807617, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8779805302619934, "num_tokens": 559995720.0, "step": 14684 }, { "epoch": 1.8680829411016409, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.77859878540039, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.865368127822876, "num_tokens": 560033304.0, "step": 14685 }, { "epoch": 1.8682101513802314, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.934940338134766, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8593679070472717, "num_tokens": 560069879.0, "step": 14686 }, { "epoch": 1.868337361658822, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.86905288696289, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8592212796211243, "num_tokens": 560106853.0, "step": 14687 }, { "epoch": 1.8684645719374124, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.91547775268555, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8625220060348511, "num_tokens": 560144225.0, "step": 14688 }, { "epoch": 1.868591782216003, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 44.816524505615234, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8557014465332031, "num_tokens": 560186520.0, "step": 14689 }, { "epoch": 1.8687189924945935, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.613380432128906, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.8655215501785278, "num_tokens": 560225458.0, "step": 14690 }, { "epoch": 1.868846202773184, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.857177734375, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8674207329750061, "num_tokens": 560263401.0, "step": 14691 }, { "epoch": 1.8689734130517746, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.936763763427734, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8588778972625732, "num_tokens": 560305026.0, "step": 14692 }, { "epoch": 1.869100623330365, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.753177642822266, "learning_rate": 1e-06, "loss": 0.6716, "mean_token_accuracy": 0.8293166160583496, "num_tokens": 560354823.0, "step": 14693 }, { "epoch": 1.8692278336089556, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.565406799316406, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8773412108421326, "num_tokens": 560391888.0, "step": 14694 }, { "epoch": 1.8693550438875461, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.18315887451172, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8656086921691895, "num_tokens": 560425350.0, "step": 14695 }, { "epoch": 1.8694822541661367, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.73069381713867, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8716708421707153, "num_tokens": 560458521.0, "step": 14696 }, { "epoch": 1.8696094644447272, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.27835464477539, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.855864942073822, "num_tokens": 560506378.0, "step": 14697 }, { "epoch": 1.8697366747233177, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.555721282958984, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8830596208572388, "num_tokens": 560544055.0, "step": 14698 }, { "epoch": 1.8698638850019083, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.87807846069336, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8725607395172119, "num_tokens": 560577240.0, "step": 14699 }, { "epoch": 1.8699910952804988, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.566688537597656, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8560818433761597, "num_tokens": 560615662.0, "step": 14700 }, { "epoch": 1.8701183055590893, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013446807861328125, "grad_norm": 45.584251403808594, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8556352853775024, "num_tokens": 560658386.0, "step": 14701 }, { "epoch": 1.8702455158376798, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.5126838684082, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8594821095466614, "num_tokens": 560696366.0, "step": 14702 }, { "epoch": 1.8703727261162701, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.28811264038086, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8506655693054199, "num_tokens": 560736419.0, "step": 14703 }, { "epoch": 1.8704999363948607, "ewc_loss": 0.15625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133514404296875, "grad_norm": 44.473697662353516, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8612111210823059, "num_tokens": 560777839.0, "step": 14704 }, { "epoch": 1.8706271466734512, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.088436126708984, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8720208406448364, "num_tokens": 560816167.0, "step": 14705 }, { "epoch": 1.8707543569520417, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.742252349853516, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8484369516372681, "num_tokens": 560854524.0, "step": 14706 }, { "epoch": 1.8708815672306323, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.84060287475586, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8579388856887817, "num_tokens": 560894393.0, "step": 14707 }, { "epoch": 1.8710087775092228, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.975582122802734, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8723804354667664, "num_tokens": 560938640.0, "step": 14708 }, { "epoch": 1.871135987787813, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.4858283996582, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8602875471115112, "num_tokens": 560976805.0, "step": 14709 }, { "epoch": 1.8712631980664036, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.6898078918457, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8571971654891968, "num_tokens": 561018940.0, "step": 14710 }, { "epoch": 1.8713904083449941, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.77051544189453, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8661277890205383, "num_tokens": 561065228.0, "step": 14711 }, { "epoch": 1.8715176186235847, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.05522155761719, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8530114889144897, "num_tokens": 561107540.0, "step": 14712 }, { "epoch": 1.8716448289021752, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.8730354309082, "learning_rate": 1e-06, "loss": 0.6666, "mean_token_accuracy": 0.8398338556289673, "num_tokens": 561144774.0, "step": 14713 }, { "epoch": 1.8717720391807657, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 45.6293830871582, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8431359529495239, "num_tokens": 561186774.0, "step": 14714 }, { "epoch": 1.8718992494593563, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.595130920410156, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8746086955070496, "num_tokens": 561223997.0, "step": 14715 }, { "epoch": 1.8720264597379468, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.14924240112305, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8697120547294617, "num_tokens": 561261583.0, "step": 14716 }, { "epoch": 1.8721536700165373, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.19974899291992, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8654677867889404, "num_tokens": 561302681.0, "step": 14717 }, { "epoch": 1.8722808802951278, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.175174713134766, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8601837754249573, "num_tokens": 561338402.0, "step": 14718 }, { "epoch": 1.8724080905737184, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.20753860473633, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8665348291397095, "num_tokens": 561372892.0, "step": 14719 }, { "epoch": 1.872535300852309, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.16110610961914, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8590552806854248, "num_tokens": 561409474.0, "step": 14720 }, { "epoch": 1.8726625111308994, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.25923156738281, "learning_rate": 1e-06, "loss": 0.6486, "mean_token_accuracy": 0.8343565464019775, "num_tokens": 561450152.0, "step": 14721 }, { "epoch": 1.87278972140949, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.92006301879883, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8557183146476746, "num_tokens": 561489448.0, "step": 14722 }, { "epoch": 1.8729169316880805, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.29874801635742, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.8371108770370483, "num_tokens": 561529684.0, "step": 14723 }, { "epoch": 1.873044141966671, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.43461227416992, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8628594279289246, "num_tokens": 561564259.0, "step": 14724 }, { "epoch": 1.8731713522452615, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.23550796508789, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8689554333686829, "num_tokens": 561609650.0, "step": 14725 }, { "epoch": 1.873298562523852, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.670318603515625, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8675836324691772, "num_tokens": 561645520.0, "step": 14726 }, { "epoch": 1.8734257728024426, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.552764892578125, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8686803579330444, "num_tokens": 561682771.0, "step": 14727 }, { "epoch": 1.873552983081033, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.19676971435547, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8518134355545044, "num_tokens": 561721317.0, "step": 14728 }, { "epoch": 1.8736801933596234, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.36590576171875, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8624987602233887, "num_tokens": 561763178.0, "step": 14729 }, { "epoch": 1.873807403638214, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.89839172363281, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8634215593338013, "num_tokens": 561802292.0, "step": 14730 }, { "epoch": 1.8739346139168045, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.67033004760742, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8720774054527283, "num_tokens": 561838610.0, "step": 14731 }, { "epoch": 1.874061824195395, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.955387115478516, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8647328615188599, "num_tokens": 561881992.0, "step": 14732 }, { "epoch": 1.8741890344739855, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.74040985107422, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8517592549324036, "num_tokens": 561917661.0, "step": 14733 }, { "epoch": 1.8743162447525759, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.1269416809082, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8625335097312927, "num_tokens": 561958218.0, "step": 14734 }, { "epoch": 1.8744434550311664, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.45857620239258, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8740112781524658, "num_tokens": 561993882.0, "step": 14735 }, { "epoch": 1.874570665309757, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.50160217285156, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8630077242851257, "num_tokens": 562035443.0, "step": 14736 }, { "epoch": 1.8746978755883474, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.074317932128906, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8609458208084106, "num_tokens": 562072806.0, "step": 14737 }, { "epoch": 1.874825085866938, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.354740142822266, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8452163338661194, "num_tokens": 562113652.0, "step": 14738 }, { "epoch": 1.8749522961455285, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.23872756958008, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8519888520240784, "num_tokens": 562150600.0, "step": 14739 }, { "epoch": 1.875079506424119, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.541866302490234, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8622377514839172, "num_tokens": 562187730.0, "step": 14740 }, { "epoch": 1.8752067167027096, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.163204193115234, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8620067834854126, "num_tokens": 562228229.0, "step": 14741 }, { "epoch": 1.8753339269813, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.48274230957031, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8678275346755981, "num_tokens": 562262293.0, "step": 14742 }, { "epoch": 1.8754611372598906, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.91853332519531, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8597508072853088, "num_tokens": 562303389.0, "step": 14743 }, { "epoch": 1.8755883475384811, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.218658447265625, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8547945618629456, "num_tokens": 562344461.0, "step": 14744 }, { "epoch": 1.8757155578170717, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.213260650634766, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.844682514667511, "num_tokens": 562387068.0, "step": 14745 }, { "epoch": 1.8758427680956622, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001354217529296875, "grad_norm": 44.13883590698242, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8599323034286499, "num_tokens": 562428225.0, "step": 14746 }, { "epoch": 1.8759699783742527, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.27126693725586, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8621432781219482, "num_tokens": 562463879.0, "step": 14747 }, { "epoch": 1.8760971886528433, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.3009033203125, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8537753820419312, "num_tokens": 562502327.0, "step": 14748 }, { "epoch": 1.8762243989314338, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.04004669189453, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.865926206111908, "num_tokens": 562542162.0, "step": 14749 }, { "epoch": 1.8763516092100243, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.3534049987793, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8554271459579468, "num_tokens": 562583364.0, "step": 14750 }, { "epoch": 1.8764788194886148, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.309513092041016, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.864130973815918, "num_tokens": 562615870.0, "step": 14751 }, { "epoch": 1.8766060297672051, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.33226013183594, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8468917608261108, "num_tokens": 562655086.0, "step": 14752 }, { "epoch": 1.8767332400457957, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.034446716308594, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8541380167007446, "num_tokens": 562697532.0, "step": 14753 }, { "epoch": 1.8768604503243862, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.18106460571289, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.839982271194458, "num_tokens": 562731057.0, "step": 14754 }, { "epoch": 1.8769876606029767, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.17607498168945, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8497669696807861, "num_tokens": 562770402.0, "step": 14755 }, { "epoch": 1.8771148708815673, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.64741897583008, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8564723134040833, "num_tokens": 562806191.0, "step": 14756 }, { "epoch": 1.8772420811601578, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.71770477294922, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.859393835067749, "num_tokens": 562846864.0, "step": 14757 }, { "epoch": 1.877369291438748, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.81725311279297, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8556180596351624, "num_tokens": 562882887.0, "step": 14758 }, { "epoch": 1.8774965017173386, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.48430252075195, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8608627319335938, "num_tokens": 562919096.0, "step": 14759 }, { "epoch": 1.8776237119959291, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.84084701538086, "learning_rate": 1e-06, "loss": 0.6618, "mean_token_accuracy": 0.8343711495399475, "num_tokens": 562959336.0, "step": 14760 }, { "epoch": 1.8777509222745197, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.76766586303711, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8602995276451111, "num_tokens": 562997883.0, "step": 14761 }, { "epoch": 1.8778781325531102, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.72230529785156, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8714929819107056, "num_tokens": 563036763.0, "step": 14762 }, { "epoch": 1.8780053428317007, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.989219665527344, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8769966959953308, "num_tokens": 563076815.0, "step": 14763 }, { "epoch": 1.8781325531102913, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.76467514038086, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8712871670722961, "num_tokens": 563110511.0, "step": 14764 }, { "epoch": 1.8782597633888818, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.629310607910156, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8562830090522766, "num_tokens": 563140757.0, "step": 14765 }, { "epoch": 1.8783869736674723, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.6339111328125, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8607402443885803, "num_tokens": 563179961.0, "step": 14766 }, { "epoch": 1.8785141839460628, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.51365661621094, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.867479145526886, "num_tokens": 563221355.0, "step": 14767 }, { "epoch": 1.8786413942246534, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.07225036621094, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8822078108787537, "num_tokens": 563257427.0, "step": 14768 }, { "epoch": 1.878768604503244, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.45689392089844, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8584346771240234, "num_tokens": 563294371.0, "step": 14769 }, { "epoch": 1.8788958147818344, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.1151008605957, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8698917627334595, "num_tokens": 563338592.0, "step": 14770 }, { "epoch": 1.879023025060425, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.23081970214844, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8659830689430237, "num_tokens": 563373853.0, "step": 14771 }, { "epoch": 1.8791502353390155, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.127723693847656, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8609832525253296, "num_tokens": 563411137.0, "step": 14772 }, { "epoch": 1.879277445617606, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.182674407958984, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8655937910079956, "num_tokens": 563450294.0, "step": 14773 }, { "epoch": 1.8794046558961965, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 43.703163146972656, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8632203340530396, "num_tokens": 563496858.0, "step": 14774 }, { "epoch": 1.879531866174787, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.51417541503906, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8559620976448059, "num_tokens": 563539566.0, "step": 14775 }, { "epoch": 1.8796590764533776, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.914493560791016, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8683083057403564, "num_tokens": 563574363.0, "step": 14776 }, { "epoch": 1.879786286731968, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.21681594848633, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8617175221443176, "num_tokens": 563605540.0, "step": 14777 }, { "epoch": 1.8799134970105584, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.28190612792969, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8463296294212341, "num_tokens": 563646640.0, "step": 14778 }, { "epoch": 1.880040707289149, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.15324020385742, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8711992502212524, "num_tokens": 563683576.0, "step": 14779 }, { "epoch": 1.8801679175677395, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.775089263916016, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8536139130592346, "num_tokens": 563724088.0, "step": 14780 }, { "epoch": 1.88029512784633, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.50092697143555, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8595955967903137, "num_tokens": 563760897.0, "step": 14781 }, { "epoch": 1.8804223381249205, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.61839294433594, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8654202222824097, "num_tokens": 563800158.0, "step": 14782 }, { "epoch": 1.8805495484035109, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.50428771972656, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8690418601036072, "num_tokens": 563833288.0, "step": 14783 }, { "epoch": 1.8806767586821014, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.629520416259766, "learning_rate": 1e-06, "loss": 0.5349, "mean_token_accuracy": 0.872418999671936, "num_tokens": 563874507.0, "step": 14784 }, { "epoch": 1.880803968960692, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.708736419677734, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8620408177375793, "num_tokens": 563912452.0, "step": 14785 }, { "epoch": 1.8809311792392824, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.34755325317383, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.8439865708351135, "num_tokens": 563950565.0, "step": 14786 }, { "epoch": 1.881058389517873, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.67027282714844, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8514979481697083, "num_tokens": 563984924.0, "step": 14787 }, { "epoch": 1.8811855997964635, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.37024688720703, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.8685271739959717, "num_tokens": 564022887.0, "step": 14788 }, { "epoch": 1.881312810075054, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.003684997558594, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.860015869140625, "num_tokens": 564064829.0, "step": 14789 }, { "epoch": 1.8814400203536445, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.10675048828125, "learning_rate": 1e-06, "loss": 0.6493, "mean_token_accuracy": 0.8389649391174316, "num_tokens": 564100858.0, "step": 14790 }, { "epoch": 1.881567230632235, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.56460952758789, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.854202151298523, "num_tokens": 564141477.0, "step": 14791 }, { "epoch": 1.8816944409108256, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.157466888427734, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8545444011688232, "num_tokens": 564179271.0, "step": 14792 }, { "epoch": 1.8818216511894161, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.82209777832031, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8683608770370483, "num_tokens": 564224336.0, "step": 14793 }, { "epoch": 1.8819488614680067, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.097496032714844, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8564803004264832, "num_tokens": 564265157.0, "step": 14794 }, { "epoch": 1.8820760717465972, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.050865173339844, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8673133850097656, "num_tokens": 564295023.0, "step": 14795 }, { "epoch": 1.8822032820251877, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.0478630065918, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8630744814872742, "num_tokens": 564333700.0, "step": 14796 }, { "epoch": 1.8823304923037782, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.943782806396484, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8481079339981079, "num_tokens": 564374342.0, "step": 14797 }, { "epoch": 1.8824577025823688, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.41871643066406, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8625705242156982, "num_tokens": 564422877.0, "step": 14798 }, { "epoch": 1.8825849128609593, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.57206344604492, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8613176941871643, "num_tokens": 564464713.0, "step": 14799 }, { "epoch": 1.8827121231395498, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.71257400512695, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8495799899101257, "num_tokens": 564500667.0, "step": 14800 }, { "epoch": 1.8828393334181401, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.77802658081055, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8701595664024353, "num_tokens": 564541460.0, "step": 14801 }, { "epoch": 1.8829665436967307, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.58753204345703, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.852767825126648, "num_tokens": 564580983.0, "step": 14802 }, { "epoch": 1.8830937539753212, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.41644287109375, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8706606030464172, "num_tokens": 564615967.0, "step": 14803 }, { "epoch": 1.8832209642539117, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.62049865722656, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8516190648078918, "num_tokens": 564653255.0, "step": 14804 }, { "epoch": 1.8833481745325023, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.0522575378418, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8575359582901001, "num_tokens": 564688420.0, "step": 14805 }, { "epoch": 1.8834753848110928, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.64242172241211, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8536100387573242, "num_tokens": 564733123.0, "step": 14806 }, { "epoch": 1.883602595089683, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.25288772583008, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8684965372085571, "num_tokens": 564767712.0, "step": 14807 }, { "epoch": 1.8837298053682736, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.23976516723633, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8583955764770508, "num_tokens": 564810663.0, "step": 14808 }, { "epoch": 1.8838570156468641, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.63389587402344, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8590500354766846, "num_tokens": 564848689.0, "step": 14809 }, { "epoch": 1.8839842259254547, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.17015075683594, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8496047258377075, "num_tokens": 564887426.0, "step": 14810 }, { "epoch": 1.8841114362040452, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.81449890136719, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8654898405075073, "num_tokens": 564924852.0, "step": 14811 }, { "epoch": 1.8842386464826357, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.11537170410156, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8566842079162598, "num_tokens": 564964201.0, "step": 14812 }, { "epoch": 1.8843658567612263, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.8638801574707, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8721256852149963, "num_tokens": 565004939.0, "step": 14813 }, { "epoch": 1.8844930670398168, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.021793365478516, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8644745945930481, "num_tokens": 565044067.0, "step": 14814 }, { "epoch": 1.8846202773184073, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.12733459472656, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8636840581893921, "num_tokens": 565082114.0, "step": 14815 }, { "epoch": 1.8847474875969978, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.99311065673828, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8611880540847778, "num_tokens": 565118350.0, "step": 14816 }, { "epoch": 1.8848746978755884, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.33090591430664, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8542698621749878, "num_tokens": 565153313.0, "step": 14817 }, { "epoch": 1.885001908154179, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.960853576660156, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.849195659160614, "num_tokens": 565191332.0, "step": 14818 }, { "epoch": 1.8851291184327694, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.2708740234375, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8582243323326111, "num_tokens": 565229937.0, "step": 14819 }, { "epoch": 1.88525632871136, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.40701675415039, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8561828136444092, "num_tokens": 565266271.0, "step": 14820 }, { "epoch": 1.8853835389899505, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.4907341003418, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8589750528335571, "num_tokens": 565300593.0, "step": 14821 }, { "epoch": 1.885510749268541, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.6407356262207, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8592626452445984, "num_tokens": 565341801.0, "step": 14822 }, { "epoch": 1.8856379595471315, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.49046325683594, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8797850608825684, "num_tokens": 565374380.0, "step": 14823 }, { "epoch": 1.885765169825722, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.64821243286133, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8528883457183838, "num_tokens": 565413096.0, "step": 14824 }, { "epoch": 1.8858923801043126, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.71904754638672, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8700028657913208, "num_tokens": 565453292.0, "step": 14825 }, { "epoch": 1.886019590382903, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.54411697387695, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8537077903747559, "num_tokens": 565489311.0, "step": 14826 }, { "epoch": 1.8861468006614934, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.56843566894531, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8448086380958557, "num_tokens": 565526293.0, "step": 14827 }, { "epoch": 1.886274010940084, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.37748336791992, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8534195423126221, "num_tokens": 565569215.0, "step": 14828 }, { "epoch": 1.8864012212186745, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.6723747253418, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8725283145904541, "num_tokens": 565610755.0, "step": 14829 }, { "epoch": 1.886528431497265, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.03077697753906, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8666014671325684, "num_tokens": 565641964.0, "step": 14830 }, { "epoch": 1.8866556417758555, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.34901809692383, "learning_rate": 1e-06, "loss": 0.6808, "mean_token_accuracy": 0.8265407085418701, "num_tokens": 565682852.0, "step": 14831 }, { "epoch": 1.8867828520544458, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.457759857177734, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.877640962600708, "num_tokens": 565721352.0, "step": 14832 }, { "epoch": 1.8869100623330364, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.43897247314453, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8650273084640503, "num_tokens": 565764045.0, "step": 14833 }, { "epoch": 1.887037272611627, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.54645919799805, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8691916465759277, "num_tokens": 565797617.0, "step": 14834 }, { "epoch": 1.8871644828902174, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.159549713134766, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8580958843231201, "num_tokens": 565839254.0, "step": 14835 }, { "epoch": 1.887291693168808, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.822303771972656, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8617428541183472, "num_tokens": 565874742.0, "step": 14836 }, { "epoch": 1.8874189034473985, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.234893798828125, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8611953854560852, "num_tokens": 565911043.0, "step": 14837 }, { "epoch": 1.887546113725989, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.641902923583984, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8557041883468628, "num_tokens": 565950228.0, "step": 14838 }, { "epoch": 1.8876733240045795, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.49251937866211, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8724579811096191, "num_tokens": 565982891.0, "step": 14839 }, { "epoch": 1.88780053428317, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.21642303466797, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8583976030349731, "num_tokens": 566022601.0, "step": 14840 }, { "epoch": 1.8879277445617606, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.75517654418945, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8624135255813599, "num_tokens": 566054928.0, "step": 14841 }, { "epoch": 1.8880549548403511, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.18527603149414, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8528966307640076, "num_tokens": 566094900.0, "step": 14842 }, { "epoch": 1.8881821651189417, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.1561279296875, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8589444756507874, "num_tokens": 566125230.0, "step": 14843 }, { "epoch": 1.8883093753975322, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.96281433105469, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8750271797180176, "num_tokens": 566165320.0, "step": 14844 }, { "epoch": 1.8884365856761227, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.595787048339844, "learning_rate": 1e-06, "loss": 0.5989, "mean_token_accuracy": 0.8497040867805481, "num_tokens": 566202769.0, "step": 14845 }, { "epoch": 1.8885637959547132, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.621334075927734, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8625266551971436, "num_tokens": 566245874.0, "step": 14846 }, { "epoch": 1.8886910062333038, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.72664260864258, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8423267602920532, "num_tokens": 566284855.0, "step": 14847 }, { "epoch": 1.8888182165118943, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.89594650268555, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8525157570838928, "num_tokens": 566323422.0, "step": 14848 }, { "epoch": 1.8889454267904848, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.61797332763672, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.868158221244812, "num_tokens": 566364357.0, "step": 14849 }, { "epoch": 1.8890726370690751, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.8498420715332, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8568968176841736, "num_tokens": 566404498.0, "step": 14850 }, { "epoch": 1.8891998473476657, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.76529312133789, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.855880856513977, "num_tokens": 566442960.0, "step": 14851 }, { "epoch": 1.8893270576262562, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.4223747253418, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8690474033355713, "num_tokens": 566475087.0, "step": 14852 }, { "epoch": 1.8894542679048467, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.27400588989258, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.851681113243103, "num_tokens": 566514337.0, "step": 14853 }, { "epoch": 1.8895814781834372, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.8082160949707, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8480857610702515, "num_tokens": 566553204.0, "step": 14854 }, { "epoch": 1.8897086884620278, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.14053726196289, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8586492538452148, "num_tokens": 566589697.0, "step": 14855 }, { "epoch": 1.889835898740618, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.3582763671875, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8467109203338623, "num_tokens": 566629680.0, "step": 14856 }, { "epoch": 1.8899631090192086, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.44041061401367, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8825362920761108, "num_tokens": 566670931.0, "step": 14857 }, { "epoch": 1.8900903192977991, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.35972595214844, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.854672908782959, "num_tokens": 566706673.0, "step": 14858 }, { "epoch": 1.8902175295763897, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.466739654541016, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8560043573379517, "num_tokens": 566749503.0, "step": 14859 }, { "epoch": 1.8903447398549802, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.26648712158203, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8745759725570679, "num_tokens": 566790700.0, "step": 14860 }, { "epoch": 1.8904719501335707, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.567203521728516, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8590903282165527, "num_tokens": 566832995.0, "step": 14861 }, { "epoch": 1.8905991604121613, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.94583511352539, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8477557301521301, "num_tokens": 566868190.0, "step": 14862 }, { "epoch": 1.8907263706907518, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.30343246459961, "learning_rate": 1e-06, "loss": 0.6577, "mean_token_accuracy": 0.8317807912826538, "num_tokens": 566906637.0, "step": 14863 }, { "epoch": 1.8908535809693423, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.8626594543457, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8610973954200745, "num_tokens": 566946006.0, "step": 14864 }, { "epoch": 1.8909807912479328, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.12802505493164, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8498037457466125, "num_tokens": 566987726.0, "step": 14865 }, { "epoch": 1.8911080015265234, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.034942626953125, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8753745555877686, "num_tokens": 567027432.0, "step": 14866 }, { "epoch": 1.891235211805114, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.88698959350586, "learning_rate": 1e-06, "loss": 0.5335, "mean_token_accuracy": 0.8657614588737488, "num_tokens": 567060607.0, "step": 14867 }, { "epoch": 1.8913624220837044, "ewc_loss": 0.158203125, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 45.460994720458984, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8535337448120117, "num_tokens": 567100691.0, "step": 14868 }, { "epoch": 1.891489632362295, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.55997085571289, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8778886795043945, "num_tokens": 567139460.0, "step": 14869 }, { "epoch": 1.8916168426408855, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 45.44075012207031, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8488678932189941, "num_tokens": 567185384.0, "step": 14870 }, { "epoch": 1.891744052919476, "ewc_loss": 0.1591796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013637542724609375, "grad_norm": 44.56459045410156, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8580276370048523, "num_tokens": 567219898.0, "step": 14871 }, { "epoch": 1.8918712631980665, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.01361846923828, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8677037954330444, "num_tokens": 567260322.0, "step": 14872 }, { "epoch": 1.891998473476657, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.55813980102539, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8575893044471741, "num_tokens": 567293739.0, "step": 14873 }, { "epoch": 1.8921256837552476, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.17892837524414, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8623471260070801, "num_tokens": 567327282.0, "step": 14874 }, { "epoch": 1.892252894033838, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.499088287353516, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8538123369216919, "num_tokens": 567366402.0, "step": 14875 }, { "epoch": 1.8923801043124284, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.9785270690918, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8533509373664856, "num_tokens": 567409924.0, "step": 14876 }, { "epoch": 1.892507314591019, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.68195343017578, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8644242286682129, "num_tokens": 567450355.0, "step": 14877 }, { "epoch": 1.8926345248696095, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.776058197021484, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8746252655982971, "num_tokens": 567487701.0, "step": 14878 }, { "epoch": 1.8927617351482, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.630218505859375, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8606029748916626, "num_tokens": 567527619.0, "step": 14879 }, { "epoch": 1.8928889454267905, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.2792854309082, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8606123924255371, "num_tokens": 567567016.0, "step": 14880 }, { "epoch": 1.8930161557053808, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.36346435546875, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8491741418838501, "num_tokens": 567605465.0, "step": 14881 }, { "epoch": 1.8931433659839714, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.94581604003906, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8623507022857666, "num_tokens": 567646680.0, "step": 14882 }, { "epoch": 1.893270576262562, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.012718200683594, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8624119758605957, "num_tokens": 567685685.0, "step": 14883 }, { "epoch": 1.8933977865411524, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.643028259277344, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8505525588989258, "num_tokens": 567717222.0, "step": 14884 }, { "epoch": 1.893524996819743, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.10147476196289, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8501788973808289, "num_tokens": 567761050.0, "step": 14885 }, { "epoch": 1.8936522070983335, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.363216400146484, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8572825193405151, "num_tokens": 567801423.0, "step": 14886 }, { "epoch": 1.893779417376924, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.4057731628418, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8518832921981812, "num_tokens": 567844614.0, "step": 14887 }, { "epoch": 1.8939066276555145, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.57266616821289, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8623799681663513, "num_tokens": 567878803.0, "step": 14888 }, { "epoch": 1.894033837934105, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.29646301269531, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8742836713790894, "num_tokens": 567909419.0, "step": 14889 }, { "epoch": 1.8941610482126956, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.691261291503906, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8622037172317505, "num_tokens": 567949221.0, "step": 14890 }, { "epoch": 1.8942882584912861, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.112884521484375, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8777655959129333, "num_tokens": 567985101.0, "step": 14891 }, { "epoch": 1.8944154687698767, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.41725540161133, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.844370424747467, "num_tokens": 568030542.0, "step": 14892 }, { "epoch": 1.8945426790484672, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.276737213134766, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8621402978897095, "num_tokens": 568070723.0, "step": 14893 }, { "epoch": 1.8946698893270577, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.6949577331543, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8533041477203369, "num_tokens": 568113444.0, "step": 14894 }, { "epoch": 1.8947970996056482, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.88981246948242, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8629747629165649, "num_tokens": 568152646.0, "step": 14895 }, { "epoch": 1.8949243098842388, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.64717102050781, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8592493534088135, "num_tokens": 568187046.0, "step": 14896 }, { "epoch": 1.8950515201628293, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.52653503417969, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8554167747497559, "num_tokens": 568225052.0, "step": 14897 }, { "epoch": 1.8951787304414198, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.69449996948242, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8509271144866943, "num_tokens": 568258299.0, "step": 14898 }, { "epoch": 1.8953059407200101, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.78009033203125, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8660433292388916, "num_tokens": 568297137.0, "step": 14899 }, { "epoch": 1.8954331509986007, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.85460662841797, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8815901279449463, "num_tokens": 568331102.0, "step": 14900 }, { "epoch": 1.8955603612771912, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.89386749267578, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8578572869300842, "num_tokens": 568373926.0, "step": 14901 }, { "epoch": 1.8956875715557817, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.730812072753906, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8698481321334839, "num_tokens": 568406470.0, "step": 14902 }, { "epoch": 1.8958147818343722, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.16527557373047, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8692896962165833, "num_tokens": 568443452.0, "step": 14903 }, { "epoch": 1.8959419921129628, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.71286392211914, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8551969528198242, "num_tokens": 568480085.0, "step": 14904 }, { "epoch": 1.896069202391553, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.05820083618164, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8568050861358643, "num_tokens": 568519191.0, "step": 14905 }, { "epoch": 1.8961964126701436, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.46436309814453, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8662275075912476, "num_tokens": 568554224.0, "step": 14906 }, { "epoch": 1.8963236229487341, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.795166015625, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8776386976242065, "num_tokens": 568586102.0, "step": 14907 }, { "epoch": 1.8964508332273247, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.0515022277832, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8545817136764526, "num_tokens": 568625224.0, "step": 14908 }, { "epoch": 1.8965780435059152, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.9836540222168, "learning_rate": 1e-06, "loss": 0.6433, "mean_token_accuracy": 0.8403415679931641, "num_tokens": 568665588.0, "step": 14909 }, { "epoch": 1.8967052537845057, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.8950309753418, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8773805499076843, "num_tokens": 568698408.0, "step": 14910 }, { "epoch": 1.8968324640630962, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.777976989746094, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8495920896530151, "num_tokens": 568731982.0, "step": 14911 }, { "epoch": 1.8969596743416868, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.89739227294922, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8593389987945557, "num_tokens": 568769388.0, "step": 14912 }, { "epoch": 1.8970868846202773, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.6280632019043, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.86588454246521, "num_tokens": 568810329.0, "step": 14913 }, { "epoch": 1.8972140948988678, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.807891845703125, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8638856410980225, "num_tokens": 568852241.0, "step": 14914 }, { "epoch": 1.8973413051774584, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.754051208496094, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8557287454605103, "num_tokens": 568889660.0, "step": 14915 }, { "epoch": 1.8974685154560489, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.77946090698242, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8573906421661377, "num_tokens": 568934489.0, "step": 14916 }, { "epoch": 1.8975957257346394, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.9815673828125, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8812210559844971, "num_tokens": 568971904.0, "step": 14917 }, { "epoch": 1.89772293601323, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.65740203857422, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.840695858001709, "num_tokens": 569011863.0, "step": 14918 }, { "epoch": 1.8978501462918205, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.54825973510742, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8821683526039124, "num_tokens": 569050295.0, "step": 14919 }, { "epoch": 1.897977356570411, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.86074447631836, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.854381263256073, "num_tokens": 569087013.0, "step": 14920 }, { "epoch": 1.8981045668490015, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.17759323120117, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8537518978118896, "num_tokens": 569125226.0, "step": 14921 }, { "epoch": 1.898231777127592, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.412960052490234, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8574070930480957, "num_tokens": 569161910.0, "step": 14922 }, { "epoch": 1.8983589874061826, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.01225280761719, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8615926504135132, "num_tokens": 569196519.0, "step": 14923 }, { "epoch": 1.898486197684773, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.293216705322266, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.857296347618103, "num_tokens": 569234404.0, "step": 14924 }, { "epoch": 1.8986134079633634, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.76046371459961, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8527824878692627, "num_tokens": 569272728.0, "step": 14925 }, { "epoch": 1.898740618241954, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.56478500366211, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8518422245979309, "num_tokens": 569315434.0, "step": 14926 }, { "epoch": 1.8988678285205445, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.89532470703125, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8446343541145325, "num_tokens": 569357473.0, "step": 14927 }, { "epoch": 1.898995038799135, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.73076629638672, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8481097221374512, "num_tokens": 569398747.0, "step": 14928 }, { "epoch": 1.8991222490777255, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.49958801269531, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8691439628601074, "num_tokens": 569434855.0, "step": 14929 }, { "epoch": 1.8992494593563158, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.572776794433594, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8612081408500671, "num_tokens": 569475236.0, "step": 14930 }, { "epoch": 1.8993766696349064, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.90726852416992, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8589948415756226, "num_tokens": 569513599.0, "step": 14931 }, { "epoch": 1.899503879913497, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.74534225463867, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8675265312194824, "num_tokens": 569559059.0, "step": 14932 }, { "epoch": 1.8996310901920874, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.76181411743164, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8792868256568909, "num_tokens": 569595240.0, "step": 14933 }, { "epoch": 1.899758300470678, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.494110107421875, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.850914478302002, "num_tokens": 569629336.0, "step": 14934 }, { "epoch": 1.8998855107492685, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.97999572753906, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8726568222045898, "num_tokens": 569663613.0, "step": 14935 }, { "epoch": 1.900012721027859, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.060482025146484, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.844483494758606, "num_tokens": 569702884.0, "step": 14936 }, { "epoch": 1.9001399313064495, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.05080795288086, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8751842379570007, "num_tokens": 569744618.0, "step": 14937 }, { "epoch": 1.90026714158504, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.334224700927734, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8692929148674011, "num_tokens": 569777083.0, "step": 14938 }, { "epoch": 1.9003943518636306, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.114315032958984, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8661321997642517, "num_tokens": 569809652.0, "step": 14939 }, { "epoch": 1.9005215621422211, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.000308990478516, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8650026917457581, "num_tokens": 569846465.0, "step": 14940 }, { "epoch": 1.9006487724208116, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 43.85538101196289, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.852393388748169, "num_tokens": 569883655.0, "step": 14941 }, { "epoch": 1.9007759826994022, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.269569396972656, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8596804141998291, "num_tokens": 569923358.0, "step": 14942 }, { "epoch": 1.9009031929779927, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.09965133666992, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8556163311004639, "num_tokens": 569961393.0, "step": 14943 }, { "epoch": 1.9010304032565832, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.385128021240234, "learning_rate": 1e-06, "loss": 0.6639, "mean_token_accuracy": 0.8337334394454956, "num_tokens": 570002092.0, "step": 14944 }, { "epoch": 1.9011576135351738, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.55659103393555, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8452978134155273, "num_tokens": 570037699.0, "step": 14945 }, { "epoch": 1.9012848238137643, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.01528549194336, "learning_rate": 1e-06, "loss": 0.6477, "mean_token_accuracy": 0.8363065719604492, "num_tokens": 570077693.0, "step": 14946 }, { "epoch": 1.9014120340923548, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.3927116394043, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8804238438606262, "num_tokens": 570115354.0, "step": 14947 }, { "epoch": 1.9015392443709451, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.62919998168945, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8673495054244995, "num_tokens": 570154387.0, "step": 14948 }, { "epoch": 1.9016664546495357, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.41851806640625, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8554431200027466, "num_tokens": 570189743.0, "step": 14949 }, { "epoch": 1.9017936649281262, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.5888786315918, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8726687431335449, "num_tokens": 570224009.0, "step": 14950 }, { "epoch": 1.9019208752067167, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.38138198852539, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8681640625, "num_tokens": 570266703.0, "step": 14951 }, { "epoch": 1.9020480854853072, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.23622131347656, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8612645864486694, "num_tokens": 570308241.0, "step": 14952 }, { "epoch": 1.9021752957638978, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.969764709472656, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8594132661819458, "num_tokens": 570346931.0, "step": 14953 }, { "epoch": 1.902302506042488, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.90093994140625, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8749374151229858, "num_tokens": 570380373.0, "step": 14954 }, { "epoch": 1.9024297163210786, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.041587829589844, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8717090487480164, "num_tokens": 570420106.0, "step": 14955 }, { "epoch": 1.9025569265996691, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.76096725463867, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8828494548797607, "num_tokens": 570464534.0, "step": 14956 }, { "epoch": 1.9026841368782597, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.958492279052734, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8500819802284241, "num_tokens": 570505734.0, "step": 14957 }, { "epoch": 1.9028113471568502, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.18075180053711, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8708329200744629, "num_tokens": 570545913.0, "step": 14958 }, { "epoch": 1.9029385574354407, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.10403823852539, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8675600290298462, "num_tokens": 570584397.0, "step": 14959 }, { "epoch": 1.9030657677140312, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.99288558959961, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8657676577568054, "num_tokens": 570622285.0, "step": 14960 }, { "epoch": 1.9031929779926218, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.350486755371094, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8451225757598877, "num_tokens": 570656331.0, "step": 14961 }, { "epoch": 1.9033201882712123, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.244659423828125, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.865942120552063, "num_tokens": 570699944.0, "step": 14962 }, { "epoch": 1.9034473985498028, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.74135971069336, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.855503499507904, "num_tokens": 570737649.0, "step": 14963 }, { "epoch": 1.9035746088283934, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.34660339355469, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8602495789527893, "num_tokens": 570773193.0, "step": 14964 }, { "epoch": 1.9037018191069839, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.90948486328125, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8549961447715759, "num_tokens": 570809117.0, "step": 14965 }, { "epoch": 1.9038290293855744, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.60085678100586, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8736891746520996, "num_tokens": 570844268.0, "step": 14966 }, { "epoch": 1.903956239664165, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.85269546508789, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8516234755516052, "num_tokens": 570890166.0, "step": 14967 }, { "epoch": 1.9040834499427555, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.3499641418457, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8563458919525146, "num_tokens": 570921419.0, "step": 14968 }, { "epoch": 1.904210660221346, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.72807312011719, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8531429171562195, "num_tokens": 570966444.0, "step": 14969 }, { "epoch": 1.9043378704999365, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.888675689697266, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8715672492980957, "num_tokens": 571011656.0, "step": 14970 }, { "epoch": 1.904465080778527, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.93556594848633, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8660745620727539, "num_tokens": 571047828.0, "step": 14971 }, { "epoch": 1.9045922910571176, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 43.94219970703125, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8591413497924805, "num_tokens": 571084541.0, "step": 14972 }, { "epoch": 1.9047195013357079, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.7576789855957, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.855904221534729, "num_tokens": 571120978.0, "step": 14973 }, { "epoch": 1.9048467116142984, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 43.895408630371094, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.874517560005188, "num_tokens": 571160181.0, "step": 14974 }, { "epoch": 1.904973921892889, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.925636291503906, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8650029897689819, "num_tokens": 571194648.0, "step": 14975 }, { "epoch": 1.9051011321714795, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.82966613769531, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8549858927726746, "num_tokens": 571238449.0, "step": 14976 }, { "epoch": 1.90522834245007, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.06170654296875, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8608141541481018, "num_tokens": 571277266.0, "step": 14977 }, { "epoch": 1.9053555527286605, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.26663589477539, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8724994659423828, "num_tokens": 571316891.0, "step": 14978 }, { "epoch": 1.9054827630072508, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.47121810913086, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8622148036956787, "num_tokens": 571353097.0, "step": 14979 }, { "epoch": 1.9056099732858414, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.75811767578125, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8757700324058533, "num_tokens": 571392217.0, "step": 14980 }, { "epoch": 1.905737183564432, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.579105377197266, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8706606030464172, "num_tokens": 571430440.0, "step": 14981 }, { "epoch": 1.9058643938430224, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.40473937988281, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8575618267059326, "num_tokens": 571472002.0, "step": 14982 }, { "epoch": 1.905991604121613, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.2672233581543, "learning_rate": 1e-06, "loss": 0.6378, "mean_token_accuracy": 0.8422727584838867, "num_tokens": 571509368.0, "step": 14983 }, { "epoch": 1.9061188144002035, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.4256477355957, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8468903303146362, "num_tokens": 571543898.0, "step": 14984 }, { "epoch": 1.906246024678794, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.378395080566406, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8548544645309448, "num_tokens": 571584395.0, "step": 14985 }, { "epoch": 1.9063732349573845, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.184600830078125, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8612881898880005, "num_tokens": 571624792.0, "step": 14986 }, { "epoch": 1.906500445235975, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.23746109008789, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8644901514053345, "num_tokens": 571667197.0, "step": 14987 }, { "epoch": 1.9066276555145656, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.36741638183594, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8551819324493408, "num_tokens": 571703648.0, "step": 14988 }, { "epoch": 1.9067548657931561, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 43.87632751464844, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8512742519378662, "num_tokens": 571738980.0, "step": 14989 }, { "epoch": 1.9068820760717466, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.43349075317383, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8625335097312927, "num_tokens": 571774540.0, "step": 14990 }, { "epoch": 1.9070092863503372, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.476409912109375, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8679096698760986, "num_tokens": 571805286.0, "step": 14991 }, { "epoch": 1.9071364966289277, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.12153625488281, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8552797436714172, "num_tokens": 571847936.0, "step": 14992 }, { "epoch": 1.9072637069075182, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.587890625, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.856035590171814, "num_tokens": 571885454.0, "step": 14993 }, { "epoch": 1.9073909171861088, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.0743522644043, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8591523766517639, "num_tokens": 571924870.0, "step": 14994 }, { "epoch": 1.9075181274646993, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.38351821899414, "learning_rate": 1e-06, "loss": 0.6539, "mean_token_accuracy": 0.8379093408584595, "num_tokens": 571957870.0, "step": 14995 }, { "epoch": 1.9076453377432898, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.28251647949219, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8521095514297485, "num_tokens": 571993645.0, "step": 14996 }, { "epoch": 1.9077725480218801, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.3137092590332, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8701459169387817, "num_tokens": 572028594.0, "step": 14997 }, { "epoch": 1.9078997583004706, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.26128387451172, "learning_rate": 1e-06, "loss": 0.6606, "mean_token_accuracy": 0.8328876495361328, "num_tokens": 572066971.0, "step": 14998 }, { "epoch": 1.9080269685790612, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.510929107666016, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8654009103775024, "num_tokens": 572107609.0, "step": 14999 }, { "epoch": 1.9081541788576517, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.509925842285156, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8691407442092896, "num_tokens": 572146940.0, "step": 15000 }, { "epoch": 1.9082813891362422, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.04890060424805, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8857150077819824, "num_tokens": 572184010.0, "step": 15001 }, { "epoch": 1.9084085994148328, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.61491012573242, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8430492877960205, "num_tokens": 572223184.0, "step": 15002 }, { "epoch": 1.908535809693423, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.4709358215332, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8609505891799927, "num_tokens": 572261770.0, "step": 15003 }, { "epoch": 1.9086630199720136, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.522430419921875, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8626676201820374, "num_tokens": 572303229.0, "step": 15004 }, { "epoch": 1.9087902302506041, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.93522644042969, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8477592468261719, "num_tokens": 572337839.0, "step": 15005 }, { "epoch": 1.9089174405291947, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.155609130859375, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8589404821395874, "num_tokens": 572373604.0, "step": 15006 }, { "epoch": 1.9090446508077852, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.908565521240234, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8672661781311035, "num_tokens": 572410425.0, "step": 15007 }, { "epoch": 1.9091718610863757, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.024497985839844, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.870801568031311, "num_tokens": 572443998.0, "step": 15008 }, { "epoch": 1.9092990713649662, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.69832229614258, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8478963971138, "num_tokens": 572485337.0, "step": 15009 }, { "epoch": 1.9094262816435568, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.00688171386719, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8503598570823669, "num_tokens": 572528273.0, "step": 15010 }, { "epoch": 1.9095534919221473, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.71611785888672, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8639333844184875, "num_tokens": 572569467.0, "step": 15011 }, { "epoch": 1.9096807022007378, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.1158561706543, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8685718178749084, "num_tokens": 572609626.0, "step": 15012 }, { "epoch": 1.9098079124793284, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.88288879394531, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.845055341720581, "num_tokens": 572652907.0, "step": 15013 }, { "epoch": 1.9099351227579189, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.24039840698242, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8662413358688354, "num_tokens": 572687460.0, "step": 15014 }, { "epoch": 1.9100623330365094, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.46070098876953, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8653541803359985, "num_tokens": 572723937.0, "step": 15015 }, { "epoch": 1.9101895433151, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.66969680786133, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8552830219268799, "num_tokens": 572764929.0, "step": 15016 }, { "epoch": 1.9103167535936905, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.02598571777344, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8595280647277832, "num_tokens": 572802545.0, "step": 15017 }, { "epoch": 1.910443963872281, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.7370491027832, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.855131983757019, "num_tokens": 572850008.0, "step": 15018 }, { "epoch": 1.9105711741508715, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.0509147644043, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8679872751235962, "num_tokens": 572888968.0, "step": 15019 }, { "epoch": 1.910698384429462, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.46503448486328, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8592109680175781, "num_tokens": 572926314.0, "step": 15020 }, { "epoch": 1.9108255947080524, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.25896072387695, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8649770617485046, "num_tokens": 572961789.0, "step": 15021 }, { "epoch": 1.9109528049866429, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.807891845703125, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8618706464767456, "num_tokens": 572993680.0, "step": 15022 }, { "epoch": 1.9110800152652334, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.45541000366211, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8627861738204956, "num_tokens": 573024003.0, "step": 15023 }, { "epoch": 1.911207225543824, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.67383575439453, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8679535388946533, "num_tokens": 573055138.0, "step": 15024 }, { "epoch": 1.9113344358224145, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 43.89835739135742, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8649810552597046, "num_tokens": 573092373.0, "step": 15025 }, { "epoch": 1.911461646101005, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.02750015258789, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8560435771942139, "num_tokens": 573131486.0, "step": 15026 }, { "epoch": 1.9115888563795955, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.3386344909668, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8692928552627563, "num_tokens": 573170114.0, "step": 15027 }, { "epoch": 1.9117160666581858, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.66815948486328, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.864141583442688, "num_tokens": 573200906.0, "step": 15028 }, { "epoch": 1.9118432769367764, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.40995788574219, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8487428426742554, "num_tokens": 573243731.0, "step": 15029 }, { "epoch": 1.9119704872153669, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.53990173339844, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.867060661315918, "num_tokens": 573283335.0, "step": 15030 }, { "epoch": 1.9120976974939574, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.26716613769531, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8466191291809082, "num_tokens": 573326628.0, "step": 15031 }, { "epoch": 1.912224907772548, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.41444396972656, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8667762279510498, "num_tokens": 573369025.0, "step": 15032 }, { "epoch": 1.9123521180511385, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.45856857299805, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8528534173965454, "num_tokens": 573411923.0, "step": 15033 }, { "epoch": 1.912479328329729, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.124359130859375, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8559750318527222, "num_tokens": 573450455.0, "step": 15034 }, { "epoch": 1.9126065386083195, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.521480560302734, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8518223762512207, "num_tokens": 573492886.0, "step": 15035 }, { "epoch": 1.91273374888691, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.52822494506836, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.8665461540222168, "num_tokens": 573532073.0, "step": 15036 }, { "epoch": 1.9128609591655006, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.08580017089844, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8777540326118469, "num_tokens": 573573632.0, "step": 15037 }, { "epoch": 1.9129881694440911, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.42728805541992, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8702690005302429, "num_tokens": 573610194.0, "step": 15038 }, { "epoch": 1.9131153797226816, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.481075286865234, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8516983985900879, "num_tokens": 573650237.0, "step": 15039 }, { "epoch": 1.9132425900012722, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.6754035949707, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.864627480506897, "num_tokens": 573686859.0, "step": 15040 }, { "epoch": 1.9133698002798627, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.42173767089844, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8608443140983582, "num_tokens": 573726770.0, "step": 15041 }, { "epoch": 1.9134970105584532, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.53932189941406, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8558881878852844, "num_tokens": 573768668.0, "step": 15042 }, { "epoch": 1.9136242208370438, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.57590103149414, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8515430688858032, "num_tokens": 573807880.0, "step": 15043 }, { "epoch": 1.9137514311156343, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.609378814697266, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8588838577270508, "num_tokens": 573847018.0, "step": 15044 }, { "epoch": 1.9138786413942248, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.5399169921875, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8764640688896179, "num_tokens": 573881193.0, "step": 15045 }, { "epoch": 1.9140058516728151, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.660797119140625, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8677634000778198, "num_tokens": 573918457.0, "step": 15046 }, { "epoch": 1.9141330619514056, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.17779541015625, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8818486928939819, "num_tokens": 573958818.0, "step": 15047 }, { "epoch": 1.9142602722299962, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.877498626708984, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8569900393486023, "num_tokens": 573996735.0, "step": 15048 }, { "epoch": 1.9143874825085867, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.3358154296875, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8645603060722351, "num_tokens": 574035005.0, "step": 15049 }, { "epoch": 1.9145146927871772, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.76149368286133, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8598923683166504, "num_tokens": 574077817.0, "step": 15050 }, { "epoch": 1.9146419030657678, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.663673400878906, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.868465006351471, "num_tokens": 574113083.0, "step": 15051 }, { "epoch": 1.914769113344358, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.49655532836914, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.864922046661377, "num_tokens": 574147324.0, "step": 15052 }, { "epoch": 1.9148963236229486, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.01763153076172, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8771410584449768, "num_tokens": 574186108.0, "step": 15053 }, { "epoch": 1.9150235339015391, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.55437469482422, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8735265731811523, "num_tokens": 574218076.0, "step": 15054 }, { "epoch": 1.9151507441801296, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.70460891723633, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8720221519470215, "num_tokens": 574256416.0, "step": 15055 }, { "epoch": 1.9152779544587202, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.76443862915039, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8743038177490234, "num_tokens": 574295262.0, "step": 15056 }, { "epoch": 1.9154051647373107, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.38032531738281, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8501663208007812, "num_tokens": 574329836.0, "step": 15057 }, { "epoch": 1.9155323750159012, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.802371978759766, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8631687164306641, "num_tokens": 574363189.0, "step": 15058 }, { "epoch": 1.9156595852944918, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.50258255004883, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.870578408241272, "num_tokens": 574396374.0, "step": 15059 }, { "epoch": 1.9157867955730823, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.79937744140625, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8652649521827698, "num_tokens": 574435488.0, "step": 15060 }, { "epoch": 1.9159140058516728, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.45509338378906, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8611220121383667, "num_tokens": 574475261.0, "step": 15061 }, { "epoch": 1.9160412161302633, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.75197982788086, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8810268044471741, "num_tokens": 574513531.0, "step": 15062 }, { "epoch": 1.9161684264088539, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.662750244140625, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8476588726043701, "num_tokens": 574551736.0, "step": 15063 }, { "epoch": 1.9162956366874444, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.31877899169922, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.854827344417572, "num_tokens": 574589978.0, "step": 15064 }, { "epoch": 1.916422846966035, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.131587982177734, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8402309417724609, "num_tokens": 574628845.0, "step": 15065 }, { "epoch": 1.9165500572446255, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.603126525878906, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8575021028518677, "num_tokens": 574660700.0, "step": 15066 }, { "epoch": 1.916677267523216, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.514076232910156, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8530393838882446, "num_tokens": 574692195.0, "step": 15067 }, { "epoch": 1.9168044778018065, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.808284759521484, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.850405216217041, "num_tokens": 574732462.0, "step": 15068 }, { "epoch": 1.916931688080397, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.6042594909668, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8681185841560364, "num_tokens": 574769353.0, "step": 15069 }, { "epoch": 1.9170588983589874, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.666839599609375, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.859952449798584, "num_tokens": 574804958.0, "step": 15070 }, { "epoch": 1.9171861086375779, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.4097785949707, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8613479733467102, "num_tokens": 574843966.0, "step": 15071 }, { "epoch": 1.9173133189161684, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.09640121459961, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8666899800300598, "num_tokens": 574882916.0, "step": 15072 }, { "epoch": 1.917440529194759, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.11056137084961, "learning_rate": 1e-06, "loss": 0.5409, "mean_token_accuracy": 0.8700248599052429, "num_tokens": 574922154.0, "step": 15073 }, { "epoch": 1.9175677394733495, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.100650787353516, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.856031060218811, "num_tokens": 574956894.0, "step": 15074 }, { "epoch": 1.91769494975194, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.41176223754883, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8757025003433228, "num_tokens": 574998343.0, "step": 15075 }, { "epoch": 1.9178221600305305, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.87572479248047, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8706507682800293, "num_tokens": 575036689.0, "step": 15076 }, { "epoch": 1.9179493703091208, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.3807487487793, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8557443618774414, "num_tokens": 575075510.0, "step": 15077 }, { "epoch": 1.9180765805877114, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.801666259765625, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8577706813812256, "num_tokens": 575115797.0, "step": 15078 }, { "epoch": 1.9182037908663019, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.5784797668457, "learning_rate": 1e-06, "loss": 0.6184, "mean_token_accuracy": 0.8484554886817932, "num_tokens": 575155240.0, "step": 15079 }, { "epoch": 1.9183310011448924, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.62197494506836, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8481758832931519, "num_tokens": 575191048.0, "step": 15080 }, { "epoch": 1.918458211423483, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.58024978637695, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.849705159664154, "num_tokens": 575227778.0, "step": 15081 }, { "epoch": 1.9185854217020735, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.26560974121094, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8628411889076233, "num_tokens": 575269774.0, "step": 15082 }, { "epoch": 1.918712631980664, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.79313278198242, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8512163162231445, "num_tokens": 575310710.0, "step": 15083 }, { "epoch": 1.9188398422592545, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.40336227416992, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8693559169769287, "num_tokens": 575344073.0, "step": 15084 }, { "epoch": 1.918967052537845, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.696712493896484, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8513591289520264, "num_tokens": 575386737.0, "step": 15085 }, { "epoch": 1.9190942628164356, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.330562591552734, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8655574321746826, "num_tokens": 575424287.0, "step": 15086 }, { "epoch": 1.919221473095026, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.22893524169922, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8627750873565674, "num_tokens": 575463859.0, "step": 15087 }, { "epoch": 1.9193486833736166, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.01185989379883, "learning_rate": 1e-06, "loss": 0.5989, "mean_token_accuracy": 0.8560402989387512, "num_tokens": 575500838.0, "step": 15088 }, { "epoch": 1.9194758936522072, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.24653625488281, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8540552854537964, "num_tokens": 575535164.0, "step": 15089 }, { "epoch": 1.9196031039307977, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.30978775024414, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.86687833070755, "num_tokens": 575569493.0, "step": 15090 }, { "epoch": 1.9197303142093882, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.691890716552734, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8444734811782837, "num_tokens": 575603396.0, "step": 15091 }, { "epoch": 1.9198575244879788, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.682899475097656, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8669952154159546, "num_tokens": 575638255.0, "step": 15092 }, { "epoch": 1.9199847347665693, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.72365188598633, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8500317335128784, "num_tokens": 575685281.0, "step": 15093 }, { "epoch": 1.9201119450451598, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.62434387207031, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.867785632610321, "num_tokens": 575725030.0, "step": 15094 }, { "epoch": 1.9202391553237501, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.88379669189453, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.854236364364624, "num_tokens": 575764237.0, "step": 15095 }, { "epoch": 1.9203663656023406, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.36884689331055, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8587886095046997, "num_tokens": 575796732.0, "step": 15096 }, { "epoch": 1.9204935758809312, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.903953552246094, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8668489456176758, "num_tokens": 575833133.0, "step": 15097 }, { "epoch": 1.9206207861595217, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.125003814697266, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8566500544548035, "num_tokens": 575870920.0, "step": 15098 }, { "epoch": 1.9207479964381122, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.00529479980469, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8639817237854004, "num_tokens": 575909132.0, "step": 15099 }, { "epoch": 1.9208752067167028, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.09290313720703, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8597016930580139, "num_tokens": 575947979.0, "step": 15100 }, { "epoch": 1.921002416995293, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.832427978515625, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8496293425559998, "num_tokens": 575988888.0, "step": 15101 }, { "epoch": 1.9211296272738836, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.4932975769043, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8627223968505859, "num_tokens": 576028614.0, "step": 15102 }, { "epoch": 1.9212568375524741, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.63383483886719, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8888858556747437, "num_tokens": 576066648.0, "step": 15103 }, { "epoch": 1.9213840478310646, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.309104919433594, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8577462434768677, "num_tokens": 576104045.0, "step": 15104 }, { "epoch": 1.9215112581096552, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.28032302856445, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8542345762252808, "num_tokens": 576145419.0, "step": 15105 }, { "epoch": 1.9216384683882457, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.994503021240234, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8569515943527222, "num_tokens": 576191410.0, "step": 15106 }, { "epoch": 1.9217656786668362, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.46175765991211, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8556801080703735, "num_tokens": 576226383.0, "step": 15107 }, { "epoch": 1.9218928889454268, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.7669677734375, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8532558679580688, "num_tokens": 576259958.0, "step": 15108 }, { "epoch": 1.9220200992240173, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.415679931640625, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8649158477783203, "num_tokens": 576299467.0, "step": 15109 }, { "epoch": 1.9221473095026078, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.90036392211914, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8538423776626587, "num_tokens": 576334137.0, "step": 15110 }, { "epoch": 1.9222745197811983, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.32365417480469, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.856243371963501, "num_tokens": 576379582.0, "step": 15111 }, { "epoch": 1.9224017300597889, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.8466796875, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8760843873023987, "num_tokens": 576414590.0, "step": 15112 }, { "epoch": 1.9225289403383794, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.204708099365234, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8514503836631775, "num_tokens": 576445023.0, "step": 15113 }, { "epoch": 1.92265615061697, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.44083786010742, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8554552793502808, "num_tokens": 576477861.0, "step": 15114 }, { "epoch": 1.9227833608955605, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.08384704589844, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8571784496307373, "num_tokens": 576513121.0, "step": 15115 }, { "epoch": 1.922910571174151, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.763816833496094, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8663867712020874, "num_tokens": 576551039.0, "step": 15116 }, { "epoch": 1.9230377814527415, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.32759475708008, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8539975881576538, "num_tokens": 576590651.0, "step": 15117 }, { "epoch": 1.923164991731332, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.940399169921875, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8514249920845032, "num_tokens": 576629119.0, "step": 15118 }, { "epoch": 1.9232922020099223, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.383544921875, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8706599473953247, "num_tokens": 576664240.0, "step": 15119 }, { "epoch": 1.9234194122885129, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.6195182800293, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.85586017370224, "num_tokens": 576701946.0, "step": 15120 }, { "epoch": 1.9235466225671034, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.70893096923828, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8719167113304138, "num_tokens": 576733541.0, "step": 15121 }, { "epoch": 1.923673832845694, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.44233322143555, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.846700131893158, "num_tokens": 576770930.0, "step": 15122 }, { "epoch": 1.9238010431242845, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.9768180847168, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8624168038368225, "num_tokens": 576807455.0, "step": 15123 }, { "epoch": 1.923928253402875, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.50117492675781, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8673349618911743, "num_tokens": 576849577.0, "step": 15124 }, { "epoch": 1.9240554636814655, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.958797454833984, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8529857397079468, "num_tokens": 576883506.0, "step": 15125 }, { "epoch": 1.9241826739600558, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.53367233276367, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.844304084777832, "num_tokens": 576923892.0, "step": 15126 }, { "epoch": 1.9243098842386464, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.75504684448242, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8781906366348267, "num_tokens": 576970832.0, "step": 15127 }, { "epoch": 1.9244370945172369, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.3940315246582, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8422548174858093, "num_tokens": 577006990.0, "step": 15128 }, { "epoch": 1.9245643047958274, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.747711181640625, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8655507564544678, "num_tokens": 577044932.0, "step": 15129 }, { "epoch": 1.924691515074418, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.53847122192383, "learning_rate": 1e-06, "loss": 0.6342, "mean_token_accuracy": 0.8438349962234497, "num_tokens": 577076217.0, "step": 15130 }, { "epoch": 1.9248187253530085, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.347408294677734, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8778650760650635, "num_tokens": 577112696.0, "step": 15131 }, { "epoch": 1.924945935631599, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.57857131958008, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.860445499420166, "num_tokens": 577156397.0, "step": 15132 }, { "epoch": 1.9250731459101895, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.67889404296875, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8642608523368835, "num_tokens": 577194192.0, "step": 15133 }, { "epoch": 1.92520035618878, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.517730712890625, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8553014993667603, "num_tokens": 577232639.0, "step": 15134 }, { "epoch": 1.9253275664673706, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.155452728271484, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.868553102016449, "num_tokens": 577267349.0, "step": 15135 }, { "epoch": 1.925454776745961, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.122039794921875, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.856844961643219, "num_tokens": 577308329.0, "step": 15136 }, { "epoch": 1.9255819870245516, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.11885452270508, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8623279333114624, "num_tokens": 577347825.0, "step": 15137 }, { "epoch": 1.9257091973031422, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.731109619140625, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8613609075546265, "num_tokens": 577379981.0, "step": 15138 }, { "epoch": 1.9258364075817327, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.84379577636719, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8648104667663574, "num_tokens": 577414134.0, "step": 15139 }, { "epoch": 1.9259636178603232, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.43330764770508, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8592082858085632, "num_tokens": 577451170.0, "step": 15140 }, { "epoch": 1.9260908281389137, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.107635498046875, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8449113368988037, "num_tokens": 577487447.0, "step": 15141 }, { "epoch": 1.9262180384175043, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.303890228271484, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8658244609832764, "num_tokens": 577525732.0, "step": 15142 }, { "epoch": 1.9263452486960948, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.97398376464844, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8590797185897827, "num_tokens": 577563303.0, "step": 15143 }, { "epoch": 1.926472458974685, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.21904754638672, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.8749756813049316, "num_tokens": 577607592.0, "step": 15144 }, { "epoch": 1.9265996692532756, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.879756927490234, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8518638610839844, "num_tokens": 577651646.0, "step": 15145 }, { "epoch": 1.9267268795318662, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.619232177734375, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8689853549003601, "num_tokens": 577689530.0, "step": 15146 }, { "epoch": 1.9268540898104567, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.33962631225586, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8541852235794067, "num_tokens": 577732524.0, "step": 15147 }, { "epoch": 1.9269813000890472, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.89161682128906, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8649460077285767, "num_tokens": 577767164.0, "step": 15148 }, { "epoch": 1.9271085103676378, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.5836181640625, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8678712844848633, "num_tokens": 577802347.0, "step": 15149 }, { "epoch": 1.927235720646228, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.976749420166016, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8530014753341675, "num_tokens": 577842899.0, "step": 15150 }, { "epoch": 1.9273629309248186, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.530662536621094, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.860546350479126, "num_tokens": 577885137.0, "step": 15151 }, { "epoch": 1.9274901412034091, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.875457763671875, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8753395080566406, "num_tokens": 577925727.0, "step": 15152 }, { "epoch": 1.9276173514819996, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.602569580078125, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8646503686904907, "num_tokens": 577965399.0, "step": 15153 }, { "epoch": 1.9277445617605902, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.019351959228516, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.859827995300293, "num_tokens": 578007608.0, "step": 15154 }, { "epoch": 1.9278717720391807, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.204471588134766, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8710377216339111, "num_tokens": 578043425.0, "step": 15155 }, { "epoch": 1.9279989823177712, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.93403625488281, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8568723797798157, "num_tokens": 578075838.0, "step": 15156 }, { "epoch": 1.9281261925963618, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.10558319091797, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8566696643829346, "num_tokens": 578110113.0, "step": 15157 }, { "epoch": 1.9282534028749523, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.791019439697266, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8615955710411072, "num_tokens": 578145743.0, "step": 15158 }, { "epoch": 1.9283806131535428, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.906917572021484, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8674113154411316, "num_tokens": 578181588.0, "step": 15159 }, { "epoch": 1.9285078234321333, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.77454376220703, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8594831824302673, "num_tokens": 578222048.0, "step": 15160 }, { "epoch": 1.9286350337107239, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.108333587646484, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.863777756690979, "num_tokens": 578262664.0, "step": 15161 }, { "epoch": 1.9287622439893144, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.29195022583008, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8500010967254639, "num_tokens": 578303431.0, "step": 15162 }, { "epoch": 1.928889454267905, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.52933120727539, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8515374660491943, "num_tokens": 578338978.0, "step": 15163 }, { "epoch": 1.9290166645464955, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.64312744140625, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8566306233406067, "num_tokens": 578376340.0, "step": 15164 }, { "epoch": 1.929143874825086, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.41006088256836, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8623709678649902, "num_tokens": 578414714.0, "step": 15165 }, { "epoch": 1.9292710851036765, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.82583999633789, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8461889624595642, "num_tokens": 578453668.0, "step": 15166 }, { "epoch": 1.929398295382267, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.42667007446289, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8534854054450989, "num_tokens": 578494148.0, "step": 15167 }, { "epoch": 1.9295255056608573, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.87869644165039, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8403360843658447, "num_tokens": 578530743.0, "step": 15168 }, { "epoch": 1.9296527159394479, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.98725509643555, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8831105828285217, "num_tokens": 578567613.0, "step": 15169 }, { "epoch": 1.9297799262180384, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.90543746948242, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.85926353931427, "num_tokens": 578602981.0, "step": 15170 }, { "epoch": 1.929907136496629, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.351173400878906, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8542237281799316, "num_tokens": 578641027.0, "step": 15171 }, { "epoch": 1.9300343467752195, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.608455657958984, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8742215037345886, "num_tokens": 578683006.0, "step": 15172 }, { "epoch": 1.93016155705381, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.3495979309082, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8651605844497681, "num_tokens": 578716722.0, "step": 15173 }, { "epoch": 1.9302887673324005, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.94506072998047, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.8415809869766235, "num_tokens": 578750740.0, "step": 15174 }, { "epoch": 1.9304159776109908, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.19700241088867, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8622644543647766, "num_tokens": 578791358.0, "step": 15175 }, { "epoch": 1.9305431878895813, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.214988708496094, "learning_rate": 1e-06, "loss": 0.6342, "mean_token_accuracy": 0.8439321517944336, "num_tokens": 578834514.0, "step": 15176 }, { "epoch": 1.9306703981681719, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.57744216918945, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8529312014579773, "num_tokens": 578875869.0, "step": 15177 }, { "epoch": 1.9307976084467624, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.26874923706055, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8635768890380859, "num_tokens": 578921079.0, "step": 15178 }, { "epoch": 1.930924818725353, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.68146896362305, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8585666418075562, "num_tokens": 578959837.0, "step": 15179 }, { "epoch": 1.9310520290039435, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.73274612426758, "learning_rate": 1e-06, "loss": 0.6837, "mean_token_accuracy": 0.8317323327064514, "num_tokens": 579001629.0, "step": 15180 }, { "epoch": 1.931179239282534, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.79161071777344, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8828456401824951, "num_tokens": 579036762.0, "step": 15181 }, { "epoch": 1.9313064495611245, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.04416275024414, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8386075496673584, "num_tokens": 579078455.0, "step": 15182 }, { "epoch": 1.931433659839715, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.926998138427734, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8636281490325928, "num_tokens": 579116942.0, "step": 15183 }, { "epoch": 1.9315608701183056, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.96687698364258, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8742567300796509, "num_tokens": 579155180.0, "step": 15184 }, { "epoch": 1.931688080396896, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.25328826904297, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8530678153038025, "num_tokens": 579194336.0, "step": 15185 }, { "epoch": 1.9318152906754866, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.39214324951172, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.854398250579834, "num_tokens": 579226609.0, "step": 15186 }, { "epoch": 1.9319425009540772, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.32053756713867, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8633416295051575, "num_tokens": 579259288.0, "step": 15187 }, { "epoch": 1.9320697112326677, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.640525817871094, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.867428719997406, "num_tokens": 579294797.0, "step": 15188 }, { "epoch": 1.9321969215112582, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.944068908691406, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8703838586807251, "num_tokens": 579334352.0, "step": 15189 }, { "epoch": 1.9323241317898487, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.120262145996094, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8500916361808777, "num_tokens": 579379618.0, "step": 15190 }, { "epoch": 1.9324513420684393, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.8731575012207, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8587198257446289, "num_tokens": 579420230.0, "step": 15191 }, { "epoch": 1.9325785523470298, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.79676055908203, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8603954315185547, "num_tokens": 579458236.0, "step": 15192 }, { "epoch": 1.93270576262562, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.853450775146484, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8600887656211853, "num_tokens": 579495561.0, "step": 15193 }, { "epoch": 1.9328329729042106, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.48973083496094, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8574812412261963, "num_tokens": 579530225.0, "step": 15194 }, { "epoch": 1.9329601831828012, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.6253547668457, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8517478108406067, "num_tokens": 579570143.0, "step": 15195 }, { "epoch": 1.9330873934613917, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.55173110961914, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8742787837982178, "num_tokens": 579607105.0, "step": 15196 }, { "epoch": 1.9332146037399822, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.27345275878906, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.874611496925354, "num_tokens": 579640018.0, "step": 15197 }, { "epoch": 1.9333418140185727, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.36398696899414, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.855309009552002, "num_tokens": 579681250.0, "step": 15198 }, { "epoch": 1.933469024297163, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.289093017578125, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8509348630905151, "num_tokens": 579718535.0, "step": 15199 }, { "epoch": 1.9335962345757536, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.45596694946289, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8477391600608826, "num_tokens": 579756959.0, "step": 15200 }, { "epoch": 1.933723444854344, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.33012390136719, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.8527007102966309, "num_tokens": 579798266.0, "step": 15201 }, { "epoch": 1.9338506551329346, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.70530319213867, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8801108598709106, "num_tokens": 579831869.0, "step": 15202 }, { "epoch": 1.9339778654115252, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.981483459472656, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8613325953483582, "num_tokens": 579874168.0, "step": 15203 }, { "epoch": 1.9341050756901157, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.816429138183594, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.866286039352417, "num_tokens": 579916313.0, "step": 15204 }, { "epoch": 1.9342322859687062, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.741641998291016, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.872009813785553, "num_tokens": 579959022.0, "step": 15205 }, { "epoch": 1.9343594962472968, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.69649124145508, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8460509777069092, "num_tokens": 580003675.0, "step": 15206 }, { "epoch": 1.9344867065258873, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.057220458984375, "learning_rate": 1e-06, "loss": 0.5482, "mean_token_accuracy": 0.8703289031982422, "num_tokens": 580044384.0, "step": 15207 }, { "epoch": 1.9346139168044778, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.73997116088867, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8507565259933472, "num_tokens": 580078955.0, "step": 15208 }, { "epoch": 1.9347411270830683, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.27845001220703, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.859289288520813, "num_tokens": 580116793.0, "step": 15209 }, { "epoch": 1.9348683373616589, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.2153434753418, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8601070642471313, "num_tokens": 580153622.0, "step": 15210 }, { "epoch": 1.9349955476402494, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.48807144165039, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.858826756477356, "num_tokens": 580195287.0, "step": 15211 }, { "epoch": 1.93512275791884, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 43.959014892578125, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.8469744920730591, "num_tokens": 580229601.0, "step": 15212 }, { "epoch": 1.9352499681974304, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.648414611816406, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8636068105697632, "num_tokens": 580268079.0, "step": 15213 }, { "epoch": 1.935377178476021, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.456295013427734, "learning_rate": 1e-06, "loss": 0.6446, "mean_token_accuracy": 0.8433085083961487, "num_tokens": 580305379.0, "step": 15214 }, { "epoch": 1.9355043887546115, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.21237564086914, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8597553372383118, "num_tokens": 580338438.0, "step": 15215 }, { "epoch": 1.935631599033202, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.06853485107422, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8605430722236633, "num_tokens": 580370017.0, "step": 15216 }, { "epoch": 1.9357588093117923, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.35756301879883, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8723160028457642, "num_tokens": 580409572.0, "step": 15217 }, { "epoch": 1.9358860195903829, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.029727935791016, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8661403059959412, "num_tokens": 580448481.0, "step": 15218 }, { "epoch": 1.9360132298689734, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.97903823852539, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8716461658477783, "num_tokens": 580481919.0, "step": 15219 }, { "epoch": 1.936140440147564, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.29611587524414, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8553306460380554, "num_tokens": 580520758.0, "step": 15220 }, { "epoch": 1.9362676504261545, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.826751708984375, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8522159457206726, "num_tokens": 580559839.0, "step": 15221 }, { "epoch": 1.936394860704745, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.13970184326172, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8873580694198608, "num_tokens": 580591767.0, "step": 15222 }, { "epoch": 1.9365220709833355, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.83679962158203, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8686110973358154, "num_tokens": 580633180.0, "step": 15223 }, { "epoch": 1.9366492812619258, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.41606140136719, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8627946376800537, "num_tokens": 580677045.0, "step": 15224 }, { "epoch": 1.9367764915405163, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.606590270996094, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.873955488204956, "num_tokens": 580717423.0, "step": 15225 }, { "epoch": 1.9369037018191069, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.486358642578125, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8640850782394409, "num_tokens": 580756677.0, "step": 15226 }, { "epoch": 1.9370309120976974, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.463157653808594, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8647230863571167, "num_tokens": 580796993.0, "step": 15227 }, { "epoch": 1.937158122376288, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.24336624145508, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8629324436187744, "num_tokens": 580834616.0, "step": 15228 }, { "epoch": 1.9372853326548785, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 44.29859161376953, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8638632297515869, "num_tokens": 580873718.0, "step": 15229 }, { "epoch": 1.937412542933469, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.310279846191406, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8566744327545166, "num_tokens": 580911678.0, "step": 15230 }, { "epoch": 1.9375397532120595, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.38658142089844, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8669853210449219, "num_tokens": 580953282.0, "step": 15231 }, { "epoch": 1.93766696349065, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.8881721496582, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8669338226318359, "num_tokens": 580997954.0, "step": 15232 }, { "epoch": 1.9377941737692406, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.28504943847656, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8621888756752014, "num_tokens": 581035623.0, "step": 15233 }, { "epoch": 1.937921384047831, "ewc_loss": 0.1611328125, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013828277587890625, "grad_norm": 45.276466369628906, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8715793490409851, "num_tokens": 581068727.0, "step": 15234 }, { "epoch": 1.9380485943264216, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.04747009277344, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8607511520385742, "num_tokens": 581111180.0, "step": 15235 }, { "epoch": 1.9381758046050122, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.23139190673828, "learning_rate": 1e-06, "loss": 0.6347, "mean_token_accuracy": 0.8421007394790649, "num_tokens": 581152373.0, "step": 15236 }, { "epoch": 1.9383030148836027, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.004398345947266, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8597400188446045, "num_tokens": 581195854.0, "step": 15237 }, { "epoch": 1.9384302251621932, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.166236877441406, "learning_rate": 1e-06, "loss": 0.6715, "mean_token_accuracy": 0.8331066370010376, "num_tokens": 581236414.0, "step": 15238 }, { "epoch": 1.9385574354407837, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.349369049072266, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8448431491851807, "num_tokens": 581275349.0, "step": 15239 }, { "epoch": 1.9386846457193743, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.1797981262207, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8683160543441772, "num_tokens": 581313231.0, "step": 15240 }, { "epoch": 1.9388118559979648, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.08072280883789, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8618000149726868, "num_tokens": 581355295.0, "step": 15241 }, { "epoch": 1.938939066276555, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.30622100830078, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8653279542922974, "num_tokens": 581392633.0, "step": 15242 }, { "epoch": 1.9390662765551456, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.143707275390625, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8741883039474487, "num_tokens": 581425552.0, "step": 15243 }, { "epoch": 1.9391934868337362, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.3582878112793, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8483385443687439, "num_tokens": 581467707.0, "step": 15244 }, { "epoch": 1.9393206971123267, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.58161163330078, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8669946193695068, "num_tokens": 581504888.0, "step": 15245 }, { "epoch": 1.9394479073909172, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.02058029174805, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8602041602134705, "num_tokens": 581537800.0, "step": 15246 }, { "epoch": 1.9395751176695077, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.552181243896484, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.881436288356781, "num_tokens": 581570594.0, "step": 15247 }, { "epoch": 1.939702327948098, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.88956069946289, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8523189425468445, "num_tokens": 581610764.0, "step": 15248 }, { "epoch": 1.9398295382266886, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.49494552612305, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8607391119003296, "num_tokens": 581652341.0, "step": 15249 }, { "epoch": 1.939956748505279, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.124202728271484, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8533056974411011, "num_tokens": 581689978.0, "step": 15250 }, { "epoch": 1.9400839587838696, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.39702224731445, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8694539070129395, "num_tokens": 581728811.0, "step": 15251 }, { "epoch": 1.9402111690624602, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 45.087894439697266, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8708233833312988, "num_tokens": 581768016.0, "step": 15252 }, { "epoch": 1.9403383793410507, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.07503128051758, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8628346920013428, "num_tokens": 581805405.0, "step": 15253 }, { "epoch": 1.9404655896196412, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.434268951416016, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8411716818809509, "num_tokens": 581845302.0, "step": 15254 }, { "epoch": 1.9405927998982317, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.29998779296875, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8699973225593567, "num_tokens": 581875391.0, "step": 15255 }, { "epoch": 1.9407200101768223, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.92615509033203, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8597738742828369, "num_tokens": 581916580.0, "step": 15256 }, { "epoch": 1.9408472204554128, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.83336639404297, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8619371652603149, "num_tokens": 581958539.0, "step": 15257 }, { "epoch": 1.9409744307340033, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.26627731323242, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8596861958503723, "num_tokens": 581996981.0, "step": 15258 }, { "epoch": 1.9411016410125939, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.2534065246582, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.870476245880127, "num_tokens": 582036068.0, "step": 15259 }, { "epoch": 1.9412288512911844, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.05126953125, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8620857000350952, "num_tokens": 582073322.0, "step": 15260 }, { "epoch": 1.941356061569775, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.453102111816406, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8642181158065796, "num_tokens": 582110736.0, "step": 15261 }, { "epoch": 1.9414832718483654, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.35178756713867, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8437147736549377, "num_tokens": 582153431.0, "step": 15262 }, { "epoch": 1.941610482126956, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.59478759765625, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8425903916358948, "num_tokens": 582191793.0, "step": 15263 }, { "epoch": 1.9417376924055465, "ewc_loss": 0.162109375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001392364501953125, "grad_norm": 44.53189468383789, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8521162271499634, "num_tokens": 582227923.0, "step": 15264 }, { "epoch": 1.941864902684137, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.069400787353516, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8445945978164673, "num_tokens": 582268029.0, "step": 15265 }, { "epoch": 1.9419921129627273, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.129032135009766, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8546733856201172, "num_tokens": 582302173.0, "step": 15266 }, { "epoch": 1.9421193232413179, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.306312561035156, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8767260313034058, "num_tokens": 582341642.0, "step": 15267 }, { "epoch": 1.9422465335199084, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.48921585083008, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8634331226348877, "num_tokens": 582378960.0, "step": 15268 }, { "epoch": 1.942373743798499, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.509849548339844, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8683422803878784, "num_tokens": 582422149.0, "step": 15269 }, { "epoch": 1.9425009540770894, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.41350555419922, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.845504879951477, "num_tokens": 582462657.0, "step": 15270 }, { "epoch": 1.94262816435568, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.686065673828125, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8734279870986938, "num_tokens": 582499264.0, "step": 15271 }, { "epoch": 1.9427553746342705, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.52640914916992, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8631687164306641, "num_tokens": 582534744.0, "step": 15272 }, { "epoch": 1.9428825849128608, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.385215759277344, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8563778400421143, "num_tokens": 582577010.0, "step": 15273 }, { "epoch": 1.9430097951914513, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.45093536376953, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8554248809814453, "num_tokens": 582617103.0, "step": 15274 }, { "epoch": 1.9431370054700419, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.08729553222656, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8529895544052124, "num_tokens": 582661754.0, "step": 15275 }, { "epoch": 1.9432642157486324, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.4623908996582, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8623533248901367, "num_tokens": 582701701.0, "step": 15276 }, { "epoch": 1.943391426027223, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.940982818603516, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8520417809486389, "num_tokens": 582741977.0, "step": 15277 }, { "epoch": 1.9435186363058135, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.514190673828125, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8712054491043091, "num_tokens": 582783420.0, "step": 15278 }, { "epoch": 1.943645846584404, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.15727615356445, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8683895468711853, "num_tokens": 582819464.0, "step": 15279 }, { "epoch": 1.9437730568629945, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.68439865112305, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8655601143836975, "num_tokens": 582860757.0, "step": 15280 }, { "epoch": 1.943900267141585, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.18107986450195, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8554286956787109, "num_tokens": 582910300.0, "step": 15281 }, { "epoch": 1.9440274774201756, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.424747467041016, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8544280529022217, "num_tokens": 582956333.0, "step": 15282 }, { "epoch": 1.944154687698766, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.317928314208984, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8686646223068237, "num_tokens": 582994068.0, "step": 15283 }, { "epoch": 1.9442818979773566, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.738807678222656, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8584002256393433, "num_tokens": 583027829.0, "step": 15284 }, { "epoch": 1.9444091082559471, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.24433135986328, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8591151833534241, "num_tokens": 583068881.0, "step": 15285 }, { "epoch": 1.9445363185345377, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.60393524169922, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8639125823974609, "num_tokens": 583100097.0, "step": 15286 }, { "epoch": 1.9446635288131282, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.589454650878906, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8598504066467285, "num_tokens": 583135340.0, "step": 15287 }, { "epoch": 1.9447907390917187, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.309303283691406, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8591765761375427, "num_tokens": 583172032.0, "step": 15288 }, { "epoch": 1.9449179493703093, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.453590393066406, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8638654947280884, "num_tokens": 583205244.0, "step": 15289 }, { "epoch": 1.9450451596488998, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.5561408996582, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8556184768676758, "num_tokens": 583247537.0, "step": 15290 }, { "epoch": 1.94517236992749, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.410552978515625, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8554209470748901, "num_tokens": 583285349.0, "step": 15291 }, { "epoch": 1.9452995802060806, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.24787139892578, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8634278774261475, "num_tokens": 583323810.0, "step": 15292 }, { "epoch": 1.9454267904846712, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.213565826416016, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.873691201210022, "num_tokens": 583354244.0, "step": 15293 }, { "epoch": 1.9455540007632617, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.81460952758789, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8760120868682861, "num_tokens": 583391764.0, "step": 15294 }, { "epoch": 1.9456812110418522, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.53890609741211, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8487226963043213, "num_tokens": 583426305.0, "step": 15295 }, { "epoch": 1.9458084213204427, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.98837661743164, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8567917346954346, "num_tokens": 583465252.0, "step": 15296 }, { "epoch": 1.945935631599033, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.4758415222168, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.852156400680542, "num_tokens": 583510001.0, "step": 15297 }, { "epoch": 1.9460628418776236, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.138858795166016, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8621371984481812, "num_tokens": 583547380.0, "step": 15298 }, { "epoch": 1.946190052156214, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.383583068847656, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8626469373703003, "num_tokens": 583588852.0, "step": 15299 }, { "epoch": 1.9463172624348046, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.13481140136719, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8675792217254639, "num_tokens": 583634006.0, "step": 15300 }, { "epoch": 1.9464444727133952, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.454349517822266, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8458835482597351, "num_tokens": 583679430.0, "step": 15301 }, { "epoch": 1.9465716829919857, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.06221008300781, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8665107488632202, "num_tokens": 583708856.0, "step": 15302 }, { "epoch": 1.9466988932705762, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.324153900146484, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8586349487304688, "num_tokens": 583750737.0, "step": 15303 }, { "epoch": 1.9468261035491667, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 44.803646087646484, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8656641244888306, "num_tokens": 583786342.0, "step": 15304 }, { "epoch": 1.9469533138277573, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.42879867553711, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8663204908370972, "num_tokens": 583821681.0, "step": 15305 }, { "epoch": 1.9470805241063478, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.086936950683594, "learning_rate": 1e-06, "loss": 0.6372, "mean_token_accuracy": 0.8414040207862854, "num_tokens": 583855640.0, "step": 15306 }, { "epoch": 1.9472077343849383, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.81239318847656, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8703852891921997, "num_tokens": 583888047.0, "step": 15307 }, { "epoch": 1.9473349446635289, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.53240203857422, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8591276407241821, "num_tokens": 583923220.0, "step": 15308 }, { "epoch": 1.9474621549421194, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.92034149169922, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8559021949768066, "num_tokens": 583965718.0, "step": 15309 }, { "epoch": 1.94758936522071, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.49076843261719, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8653895854949951, "num_tokens": 584014027.0, "step": 15310 }, { "epoch": 1.9477165754993004, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.313716888427734, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8638507127761841, "num_tokens": 584045019.0, "step": 15311 }, { "epoch": 1.947843785777891, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.50138854980469, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8448630571365356, "num_tokens": 584086839.0, "step": 15312 }, { "epoch": 1.9479709960564815, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.23732376098633, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8761225938796997, "num_tokens": 584123785.0, "step": 15313 }, { "epoch": 1.948098206335072, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.55462646484375, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8514212369918823, "num_tokens": 584163768.0, "step": 15314 }, { "epoch": 1.9482254166136623, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.2211799621582, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8572719693183899, "num_tokens": 584201668.0, "step": 15315 }, { "epoch": 1.9483526268922529, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.24901580810547, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.8688603639602661, "num_tokens": 584236626.0, "step": 15316 }, { "epoch": 1.9484798371708434, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.24659729003906, "learning_rate": 1e-06, "loss": 0.6372, "mean_token_accuracy": 0.8444247245788574, "num_tokens": 584279811.0, "step": 15317 }, { "epoch": 1.948607047449434, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.33363723754883, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8611372709274292, "num_tokens": 584317314.0, "step": 15318 }, { "epoch": 1.9487342577280244, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.40962600708008, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8671256303787231, "num_tokens": 584355768.0, "step": 15319 }, { "epoch": 1.948861468006615, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.60905075073242, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8576196432113647, "num_tokens": 584386205.0, "step": 15320 }, { "epoch": 1.9489886782852053, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.19116973876953, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8718711137771606, "num_tokens": 584419721.0, "step": 15321 }, { "epoch": 1.9491158885637958, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.978126525878906, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8575180768966675, "num_tokens": 584451029.0, "step": 15322 }, { "epoch": 1.9492430988423863, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.8200569152832, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8740439414978027, "num_tokens": 584483991.0, "step": 15323 }, { "epoch": 1.9493703091209769, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.86178207397461, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8585443496704102, "num_tokens": 584517652.0, "step": 15324 }, { "epoch": 1.9494975193995674, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.184017181396484, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8689424395561218, "num_tokens": 584561058.0, "step": 15325 }, { "epoch": 1.949624729678158, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.40891647338867, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8639557957649231, "num_tokens": 584595097.0, "step": 15326 }, { "epoch": 1.9497519399567484, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.2447624206543, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.867751955986023, "num_tokens": 584639073.0, "step": 15327 }, { "epoch": 1.949879150235339, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.595489501953125, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8510876893997192, "num_tokens": 584680517.0, "step": 15328 }, { "epoch": 1.9500063605139295, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.070777893066406, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8594724535942078, "num_tokens": 584717796.0, "step": 15329 }, { "epoch": 1.95013357079252, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.53498840332031, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8742501735687256, "num_tokens": 584755595.0, "step": 15330 }, { "epoch": 1.9502607810711106, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.00802993774414, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8500680923461914, "num_tokens": 584798681.0, "step": 15331 }, { "epoch": 1.950387991349701, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.440673828125, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8629505038261414, "num_tokens": 584835424.0, "step": 15332 }, { "epoch": 1.9505152016282916, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.77428436279297, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.873979926109314, "num_tokens": 584872063.0, "step": 15333 }, { "epoch": 1.9506424119068821, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.5221061706543, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8771746158599854, "num_tokens": 584905850.0, "step": 15334 }, { "epoch": 1.9507696221854727, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.72454833984375, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8707022666931152, "num_tokens": 584946480.0, "step": 15335 }, { "epoch": 1.9508968324640632, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.758052825927734, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8676831126213074, "num_tokens": 584977382.0, "step": 15336 }, { "epoch": 1.9510240427426537, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.82326126098633, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8641341924667358, "num_tokens": 585010885.0, "step": 15337 }, { "epoch": 1.9511512530212443, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.33881378173828, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8577608466148376, "num_tokens": 585048208.0, "step": 15338 }, { "epoch": 1.9512784632998348, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.16877746582031, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8566677570343018, "num_tokens": 585087477.0, "step": 15339 }, { "epoch": 1.951405673578425, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.24810791015625, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8594012260437012, "num_tokens": 585120860.0, "step": 15340 }, { "epoch": 1.9515328838570156, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.081626892089844, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8639280796051025, "num_tokens": 585161887.0, "step": 15341 }, { "epoch": 1.9516600941356061, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.69780349731445, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8699511289596558, "num_tokens": 585200496.0, "step": 15342 }, { "epoch": 1.9517873044141967, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.62815475463867, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.861762285232544, "num_tokens": 585243976.0, "step": 15343 }, { "epoch": 1.9519145146927872, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.85760498046875, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.856676459312439, "num_tokens": 585282366.0, "step": 15344 }, { "epoch": 1.9520417249713777, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.670654296875, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8614429235458374, "num_tokens": 585325224.0, "step": 15345 }, { "epoch": 1.952168935249968, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.234310150146484, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8705294132232666, "num_tokens": 585361954.0, "step": 15346 }, { "epoch": 1.9522961455285586, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.7392578125, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.858534574508667, "num_tokens": 585406033.0, "step": 15347 }, { "epoch": 1.952423355807149, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.84861755371094, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8595484495162964, "num_tokens": 585440457.0, "step": 15348 }, { "epoch": 1.9525505660857396, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.08270263671875, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8832293748855591, "num_tokens": 585473451.0, "step": 15349 }, { "epoch": 1.9526777763643302, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.86328887939453, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8655166625976562, "num_tokens": 585509652.0, "step": 15350 }, { "epoch": 1.9528049866429207, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.073394775390625, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8643022775650024, "num_tokens": 585545591.0, "step": 15351 }, { "epoch": 1.9529321969215112, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.53273391723633, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8503351807594299, "num_tokens": 585583757.0, "step": 15352 }, { "epoch": 1.9530594072001017, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.055545806884766, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8497109413146973, "num_tokens": 585623897.0, "step": 15353 }, { "epoch": 1.9531866174786923, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.26483154296875, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8647347688674927, "num_tokens": 585661043.0, "step": 15354 }, { "epoch": 1.9533138277572828, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.041812896728516, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8350731730461121, "num_tokens": 585699201.0, "step": 15355 }, { "epoch": 1.9534410380358733, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.07518005371094, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8530641794204712, "num_tokens": 585735325.0, "step": 15356 }, { "epoch": 1.9535682483144639, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.17232894897461, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.868837833404541, "num_tokens": 585769909.0, "step": 15357 }, { "epoch": 1.9536954585930544, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.847679138183594, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8518736362457275, "num_tokens": 585804590.0, "step": 15358 }, { "epoch": 1.953822668871645, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.12589645385742, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8583653569221497, "num_tokens": 585843349.0, "step": 15359 }, { "epoch": 1.9539498791502354, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.831302642822266, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.8745315074920654, "num_tokens": 585881846.0, "step": 15360 }, { "epoch": 1.954077089428826, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.208927154541016, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8582042455673218, "num_tokens": 585917848.0, "step": 15361 }, { "epoch": 1.9542042997074165, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.55922317504883, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8674864172935486, "num_tokens": 585961249.0, "step": 15362 }, { "epoch": 1.954331509986007, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.06636428833008, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8697681427001953, "num_tokens": 586005795.0, "step": 15363 }, { "epoch": 1.9544587202645973, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.729915618896484, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8703372478485107, "num_tokens": 586041523.0, "step": 15364 }, { "epoch": 1.9545859305431879, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.14788818359375, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.852373480796814, "num_tokens": 586077800.0, "step": 15365 }, { "epoch": 1.9547131408217784, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.10185241699219, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.857373058795929, "num_tokens": 586116010.0, "step": 15366 }, { "epoch": 1.954840351100369, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.925907135009766, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8741466999053955, "num_tokens": 586155359.0, "step": 15367 }, { "epoch": 1.9549675613789594, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.935585021972656, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.865757942199707, "num_tokens": 586188190.0, "step": 15368 }, { "epoch": 1.95509477165755, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.22914123535156, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8607333898544312, "num_tokens": 586225971.0, "step": 15369 }, { "epoch": 1.9552219819361403, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.6309928894043, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8572341203689575, "num_tokens": 586263295.0, "step": 15370 }, { "epoch": 1.9553491922147308, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.144248962402344, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8833114504814148, "num_tokens": 586293547.0, "step": 15371 }, { "epoch": 1.9554764024933213, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.53554916381836, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.857440710067749, "num_tokens": 586333180.0, "step": 15372 }, { "epoch": 1.9556036127719119, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.91522216796875, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8625006675720215, "num_tokens": 586368780.0, "step": 15373 }, { "epoch": 1.9557308230505024, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.4447021484375, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8453243374824524, "num_tokens": 586404177.0, "step": 15374 }, { "epoch": 1.955858033329093, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.87854766845703, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8477916121482849, "num_tokens": 586446917.0, "step": 15375 }, { "epoch": 1.9559852436076834, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.90265655517578, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8606957197189331, "num_tokens": 586485832.0, "step": 15376 }, { "epoch": 1.956112453886274, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.03975296020508, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8716261386871338, "num_tokens": 586524089.0, "step": 15377 }, { "epoch": 1.9562396641648645, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.96033477783203, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.86591637134552, "num_tokens": 586564097.0, "step": 15378 }, { "epoch": 1.956366874443455, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.04597091674805, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8730419874191284, "num_tokens": 586600653.0, "step": 15379 }, { "epoch": 1.9564940847220456, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.285728454589844, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8675901889801025, "num_tokens": 586639527.0, "step": 15380 }, { "epoch": 1.956621295000636, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.71182632446289, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8652915954589844, "num_tokens": 586670885.0, "step": 15381 }, { "epoch": 1.9567485052792266, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.00959777832031, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8756728172302246, "num_tokens": 586709875.0, "step": 15382 }, { "epoch": 1.9568757155578171, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.32814407348633, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8551098108291626, "num_tokens": 586753164.0, "step": 15383 }, { "epoch": 1.9570029258364077, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.7020378112793, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8694902062416077, "num_tokens": 586793737.0, "step": 15384 }, { "epoch": 1.9571301361149982, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.00017547607422, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8578423261642456, "num_tokens": 586833428.0, "step": 15385 }, { "epoch": 1.9572573463935887, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.654422760009766, "learning_rate": 1e-06, "loss": 0.6538, "mean_token_accuracy": 0.8390138149261475, "num_tokens": 586871202.0, "step": 15386 }, { "epoch": 1.9573845566721793, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.195552825927734, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8723374605178833, "num_tokens": 586905537.0, "step": 15387 }, { "epoch": 1.9575117669507698, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.46920394897461, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8512616157531738, "num_tokens": 586940378.0, "step": 15388 }, { "epoch": 1.95763897722936, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.11361312866211, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.872372031211853, "num_tokens": 586978806.0, "step": 15389 }, { "epoch": 1.9577661875079506, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.537193298339844, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8557720184326172, "num_tokens": 587007367.0, "step": 15390 }, { "epoch": 1.9578933977865411, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.07709503173828, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8835059404373169, "num_tokens": 587040422.0, "step": 15391 }, { "epoch": 1.9580206080651317, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.95858383178711, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8803607225418091, "num_tokens": 587070238.0, "step": 15392 }, { "epoch": 1.9581478183437222, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.807830810546875, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8681044578552246, "num_tokens": 587105973.0, "step": 15393 }, { "epoch": 1.9582750286223127, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.010799407958984, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8753481507301331, "num_tokens": 587137975.0, "step": 15394 }, { "epoch": 1.958402238900903, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.646827697753906, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8584712147712708, "num_tokens": 587169176.0, "step": 15395 }, { "epoch": 1.9585294491794936, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.092979431152344, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8596704602241516, "num_tokens": 587205570.0, "step": 15396 }, { "epoch": 1.958656659458084, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.6598014831543, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8538667559623718, "num_tokens": 587246729.0, "step": 15397 }, { "epoch": 1.9587838697366746, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.98280334472656, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8654618263244629, "num_tokens": 587284537.0, "step": 15398 }, { "epoch": 1.9589110800152651, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.88899612426758, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8674258589744568, "num_tokens": 587324394.0, "step": 15399 }, { "epoch": 1.9590382902938557, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.580482482910156, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8698121905326843, "num_tokens": 587359935.0, "step": 15400 }, { "epoch": 1.9591655005724462, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.10292053222656, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8740482330322266, "num_tokens": 587402081.0, "step": 15401 }, { "epoch": 1.9592927108510367, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.28522872924805, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8602604269981384, "num_tokens": 587442939.0, "step": 15402 }, { "epoch": 1.9594199211296273, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.107547760009766, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8532429933547974, "num_tokens": 587479155.0, "step": 15403 }, { "epoch": 1.9595471314082178, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.606849670410156, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8497312664985657, "num_tokens": 587519285.0, "step": 15404 }, { "epoch": 1.9596743416868083, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.16493225097656, "learning_rate": 1e-06, "loss": 0.6321, "mean_token_accuracy": 0.8437159657478333, "num_tokens": 587563978.0, "step": 15405 }, { "epoch": 1.9598015519653988, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.47571563720703, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8562911748886108, "num_tokens": 587599052.0, "step": 15406 }, { "epoch": 1.9599287622439894, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.16624450683594, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.8654237985610962, "num_tokens": 587639980.0, "step": 15407 }, { "epoch": 1.96005597252258, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.92768859863281, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8570790886878967, "num_tokens": 587673836.0, "step": 15408 }, { "epoch": 1.9601831828011704, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.51115036010742, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8620755672454834, "num_tokens": 587712896.0, "step": 15409 }, { "epoch": 1.960310393079761, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.23955154418945, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8561621904373169, "num_tokens": 587747352.0, "step": 15410 }, { "epoch": 1.9604376033583515, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.22280502319336, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8605477809906006, "num_tokens": 587790234.0, "step": 15411 }, { "epoch": 1.960564813636942, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.360992431640625, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8754341006278992, "num_tokens": 587830185.0, "step": 15412 }, { "epoch": 1.9606920239155323, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.33579635620117, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8621193766593933, "num_tokens": 587866265.0, "step": 15413 }, { "epoch": 1.9608192341941229, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.897525787353516, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8728364706039429, "num_tokens": 587904956.0, "step": 15414 }, { "epoch": 1.9609464444727134, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.1014518737793, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8688778281211853, "num_tokens": 587942369.0, "step": 15415 }, { "epoch": 1.961073654751304, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.65717315673828, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.866182804107666, "num_tokens": 587979340.0, "step": 15416 }, { "epoch": 1.9612008650298944, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.32376480102539, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8576176166534424, "num_tokens": 588021812.0, "step": 15417 }, { "epoch": 1.961328075308485, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.693477630615234, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8589391708374023, "num_tokens": 588064978.0, "step": 15418 }, { "epoch": 1.9614552855870753, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.787506103515625, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.858083963394165, "num_tokens": 588098142.0, "step": 15419 }, { "epoch": 1.9615824958656658, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.313995361328125, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.86838299036026, "num_tokens": 588141808.0, "step": 15420 }, { "epoch": 1.9617097061442563, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.62324523925781, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8683381080627441, "num_tokens": 588178137.0, "step": 15421 }, { "epoch": 1.9618369164228469, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.40922164916992, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8551476001739502, "num_tokens": 588215646.0, "step": 15422 }, { "epoch": 1.9619641267014374, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.42743682861328, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8521435260772705, "num_tokens": 588251345.0, "step": 15423 }, { "epoch": 1.962091336980028, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.631656646728516, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8687759041786194, "num_tokens": 588291222.0, "step": 15424 }, { "epoch": 1.9622185472586184, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.31024932861328, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8516322374343872, "num_tokens": 588329480.0, "step": 15425 }, { "epoch": 1.962345757537209, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.08997344970703, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.853805661201477, "num_tokens": 588367143.0, "step": 15426 }, { "epoch": 1.9624729678157995, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.95439147949219, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8716528415679932, "num_tokens": 588408363.0, "step": 15427 }, { "epoch": 1.96260017809439, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.53782272338867, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8685041666030884, "num_tokens": 588445397.0, "step": 15428 }, { "epoch": 1.9627273883729806, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.670467376708984, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8675328493118286, "num_tokens": 588480436.0, "step": 15429 }, { "epoch": 1.962854598651571, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.1565055847168, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8617759943008423, "num_tokens": 588526918.0, "step": 15430 }, { "epoch": 1.9629818089301616, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.941314697265625, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8616666793823242, "num_tokens": 588561300.0, "step": 15431 }, { "epoch": 1.9631090192087521, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.410953521728516, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8525845408439636, "num_tokens": 588603393.0, "step": 15432 }, { "epoch": 1.9632362294873427, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.75248336791992, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8687964677810669, "num_tokens": 588643806.0, "step": 15433 }, { "epoch": 1.9633634397659332, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.78917694091797, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8610910177230835, "num_tokens": 588685090.0, "step": 15434 }, { "epoch": 1.9634906500445237, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.42494201660156, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8496557474136353, "num_tokens": 588720983.0, "step": 15435 }, { "epoch": 1.9636178603231143, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.657352447509766, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8552175760269165, "num_tokens": 588760143.0, "step": 15436 }, { "epoch": 1.9637450706017048, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.3653450012207, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8633575439453125, "num_tokens": 588796636.0, "step": 15437 }, { "epoch": 1.963872280880295, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.06240463256836, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8573654890060425, "num_tokens": 588831681.0, "step": 15438 }, { "epoch": 1.9639994911588856, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.90974426269531, "learning_rate": 1e-06, "loss": 0.5389, "mean_token_accuracy": 0.8726609945297241, "num_tokens": 588875408.0, "step": 15439 }, { "epoch": 1.9641267014374761, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.77272415161133, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8668081164360046, "num_tokens": 588913583.0, "step": 15440 }, { "epoch": 1.9642539117160667, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.352291107177734, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8716720342636108, "num_tokens": 588949833.0, "step": 15441 }, { "epoch": 1.9643811219946572, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.48889923095703, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8655673265457153, "num_tokens": 588985678.0, "step": 15442 }, { "epoch": 1.9645083322732477, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.575714111328125, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8600528240203857, "num_tokens": 589018276.0, "step": 15443 }, { "epoch": 1.964635542551838, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.4163818359375, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8604145050048828, "num_tokens": 589061900.0, "step": 15444 }, { "epoch": 1.9647627528304286, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.205562591552734, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8630270957946777, "num_tokens": 589102740.0, "step": 15445 }, { "epoch": 1.964889963109019, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.33598327636719, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8719016313552856, "num_tokens": 589135276.0, "step": 15446 }, { "epoch": 1.9650171733876096, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.49843215942383, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8620131611824036, "num_tokens": 589173390.0, "step": 15447 }, { "epoch": 1.9651443836662001, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.444923400878906, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8538051843643188, "num_tokens": 589214592.0, "step": 15448 }, { "epoch": 1.9652715939447907, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.17009353637695, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.862360417842865, "num_tokens": 589250682.0, "step": 15449 }, { "epoch": 1.9653988042233812, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.79963684082031, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8569158315658569, "num_tokens": 589288326.0, "step": 15450 }, { "epoch": 1.9655260145019717, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.43010711669922, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8598350882530212, "num_tokens": 589321632.0, "step": 15451 }, { "epoch": 1.9656532247805623, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.64209747314453, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8511323928833008, "num_tokens": 589361134.0, "step": 15452 }, { "epoch": 1.9657804350591528, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.1761474609375, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8649744987487793, "num_tokens": 589402338.0, "step": 15453 }, { "epoch": 1.9659076453377433, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.6424446105957, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8554607629776001, "num_tokens": 589441080.0, "step": 15454 }, { "epoch": 1.9660348556163338, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.833499908447266, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8590823411941528, "num_tokens": 589484019.0, "step": 15455 }, { "epoch": 1.9661620658949244, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.32120132446289, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.856511116027832, "num_tokens": 589525890.0, "step": 15456 }, { "epoch": 1.966289276173515, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.73917770385742, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8681269884109497, "num_tokens": 589560644.0, "step": 15457 }, { "epoch": 1.9664164864521054, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.352333068847656, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8814718723297119, "num_tokens": 589600863.0, "step": 15458 }, { "epoch": 1.966543696730696, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.058998107910156, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8611589670181274, "num_tokens": 589637948.0, "step": 15459 }, { "epoch": 1.9666709070092865, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.263336181640625, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8766607046127319, "num_tokens": 589678061.0, "step": 15460 }, { "epoch": 1.966798117287877, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.84004592895508, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8508567810058594, "num_tokens": 589718050.0, "step": 15461 }, { "epoch": 1.9669253275664673, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.34848403930664, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8644824624061584, "num_tokens": 589759206.0, "step": 15462 }, { "epoch": 1.9670525378450578, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.55754470825195, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8510416150093079, "num_tokens": 589800218.0, "step": 15463 }, { "epoch": 1.9671797481236484, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.363285064697266, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8626762628555298, "num_tokens": 589843105.0, "step": 15464 }, { "epoch": 1.967306958402239, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.53376770019531, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8527827262878418, "num_tokens": 589881560.0, "step": 15465 }, { "epoch": 1.9674341686808294, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.29315948486328, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.850387692451477, "num_tokens": 589922755.0, "step": 15466 }, { "epoch": 1.96756137895942, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.17366027832031, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8435853719711304, "num_tokens": 589964982.0, "step": 15467 }, { "epoch": 1.9676885892380103, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.65010070800781, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8485368490219116, "num_tokens": 590003371.0, "step": 15468 }, { "epoch": 1.9678157995166008, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.812808990478516, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8597724437713623, "num_tokens": 590048361.0, "step": 15469 }, { "epoch": 1.9679430097951913, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.8770637512207, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.857693612575531, "num_tokens": 590083141.0, "step": 15470 }, { "epoch": 1.9680702200737819, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.48543167114258, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8570438623428345, "num_tokens": 590128161.0, "step": 15471 }, { "epoch": 1.9681974303523724, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.86448287963867, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8781099319458008, "num_tokens": 590166101.0, "step": 15472 }, { "epoch": 1.968324640630963, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.37345886230469, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8607962131500244, "num_tokens": 590207643.0, "step": 15473 }, { "epoch": 1.9684518509095534, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.077701568603516, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8536520600318909, "num_tokens": 590247567.0, "step": 15474 }, { "epoch": 1.968579061188144, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.17092514038086, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8564913272857666, "num_tokens": 590287940.0, "step": 15475 }, { "epoch": 1.9687062714667345, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.42530822753906, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8561838865280151, "num_tokens": 590322558.0, "step": 15476 }, { "epoch": 1.968833481745325, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.757991790771484, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8599869608879089, "num_tokens": 590362913.0, "step": 15477 }, { "epoch": 1.9689606920239155, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.57428741455078, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8527123332023621, "num_tokens": 590405877.0, "step": 15478 }, { "epoch": 1.969087902302506, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.69933319091797, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.852031946182251, "num_tokens": 590445373.0, "step": 15479 }, { "epoch": 1.9692151125810966, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.44037628173828, "learning_rate": 1e-06, "loss": 0.6311, "mean_token_accuracy": 0.8504180908203125, "num_tokens": 590483319.0, "step": 15480 }, { "epoch": 1.9693423228596871, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.87277603149414, "learning_rate": 1e-06, "loss": 0.626, "mean_token_accuracy": 0.8478330969810486, "num_tokens": 590522433.0, "step": 15481 }, { "epoch": 1.9694695331382777, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.80182647705078, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.8653750419616699, "num_tokens": 590564342.0, "step": 15482 }, { "epoch": 1.9695967434168682, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.92982482910156, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8633992671966553, "num_tokens": 590594527.0, "step": 15483 }, { "epoch": 1.9697239536954587, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.833106994628906, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8728457689285278, "num_tokens": 590630573.0, "step": 15484 }, { "epoch": 1.9698511639740492, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.30947494506836, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8620116114616394, "num_tokens": 590669333.0, "step": 15485 }, { "epoch": 1.9699783742526398, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.24463653564453, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8775883913040161, "num_tokens": 590709884.0, "step": 15486 }, { "epoch": 1.97010558453123, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.060523986816406, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8676062822341919, "num_tokens": 590746109.0, "step": 15487 }, { "epoch": 1.9702327948098206, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.718990325927734, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.845695972442627, "num_tokens": 590787881.0, "step": 15488 }, { "epoch": 1.9703600050884111, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.84150314331055, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8463224172592163, "num_tokens": 590826861.0, "step": 15489 }, { "epoch": 1.9704872153670017, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.406742095947266, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8670064210891724, "num_tokens": 590863104.0, "step": 15490 }, { "epoch": 1.9706144256455922, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.94832229614258, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.868066668510437, "num_tokens": 590900582.0, "step": 15491 }, { "epoch": 1.9707416359241827, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.234169006347656, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8682807683944702, "num_tokens": 590937735.0, "step": 15492 }, { "epoch": 1.970868846202773, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.50674819946289, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8755310773849487, "num_tokens": 590973089.0, "step": 15493 }, { "epoch": 1.9709960564813636, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 44.75594711303711, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8475204110145569, "num_tokens": 591012117.0, "step": 15494 }, { "epoch": 1.971123266759954, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.67348861694336, "learning_rate": 1e-06, "loss": 0.6383, "mean_token_accuracy": 0.8433950543403625, "num_tokens": 591044978.0, "step": 15495 }, { "epoch": 1.9712504770385446, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.865177154541016, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8592466711997986, "num_tokens": 591085562.0, "step": 15496 }, { "epoch": 1.9713776873171351, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.429588317871094, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8506031036376953, "num_tokens": 591120140.0, "step": 15497 }, { "epoch": 1.9715048975957257, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.00141906738281, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8520135879516602, "num_tokens": 591153952.0, "step": 15498 }, { "epoch": 1.9716321078743162, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.19664764404297, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8785596489906311, "num_tokens": 591192983.0, "step": 15499 }, { "epoch": 1.9717593181529067, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.41017150878906, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8629023432731628, "num_tokens": 591226260.0, "step": 15500 }, { "epoch": 1.9718865284314973, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.92374038696289, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.8468941450119019, "num_tokens": 591259361.0, "step": 15501 }, { "epoch": 1.9720137387100878, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.86063003540039, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8596429824829102, "num_tokens": 591299753.0, "step": 15502 }, { "epoch": 1.9721409489886783, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.268001556396484, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8627958297729492, "num_tokens": 591335008.0, "step": 15503 }, { "epoch": 1.9722681592672688, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.31717300415039, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8666965961456299, "num_tokens": 591373001.0, "step": 15504 }, { "epoch": 1.9723953695458594, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.29026794433594, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8635128736495972, "num_tokens": 591408637.0, "step": 15505 }, { "epoch": 1.97252257982445, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.37120056152344, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.855957567691803, "num_tokens": 591444908.0, "step": 15506 }, { "epoch": 1.9726497901030404, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.31355667114258, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8624233603477478, "num_tokens": 591484641.0, "step": 15507 }, { "epoch": 1.972777000381631, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.53187942504883, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8585923910140991, "num_tokens": 591521859.0, "step": 15508 }, { "epoch": 1.9729042106602215, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.99222183227539, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.860109269618988, "num_tokens": 591559124.0, "step": 15509 }, { "epoch": 1.973031420938812, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.48686599731445, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.858855128288269, "num_tokens": 591597170.0, "step": 15510 }, { "epoch": 1.9731586312174023, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.704349517822266, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8664901256561279, "num_tokens": 591634784.0, "step": 15511 }, { "epoch": 1.9732858414959928, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.89414596557617, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8660937547683716, "num_tokens": 591671904.0, "step": 15512 }, { "epoch": 1.9734130517745834, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.54231262207031, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8626097440719604, "num_tokens": 591712990.0, "step": 15513 }, { "epoch": 1.973540262053174, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.8814697265625, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8586689829826355, "num_tokens": 591748579.0, "step": 15514 }, { "epoch": 1.9736674723317644, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.491737365722656, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8551963567733765, "num_tokens": 591785865.0, "step": 15515 }, { "epoch": 1.973794682610355, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.122764587402344, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8693711161613464, "num_tokens": 591820087.0, "step": 15516 }, { "epoch": 1.9739218928889453, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.50994110107422, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.86186283826828, "num_tokens": 591859713.0, "step": 15517 }, { "epoch": 1.9740491031675358, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.31206130981445, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8696597814559937, "num_tokens": 591893142.0, "step": 15518 }, { "epoch": 1.9741763134461263, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.03190612792969, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8569600582122803, "num_tokens": 591931529.0, "step": 15519 }, { "epoch": 1.9743035237247168, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.148311614990234, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8659942150115967, "num_tokens": 591976248.0, "step": 15520 }, { "epoch": 1.9744307340033074, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.89216995239258, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8592462539672852, "num_tokens": 592018008.0, "step": 15521 }, { "epoch": 1.974557944281898, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.877098083496094, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8463973999023438, "num_tokens": 592062934.0, "step": 15522 }, { "epoch": 1.9746851545604884, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.90655517578125, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8591543436050415, "num_tokens": 592103129.0, "step": 15523 }, { "epoch": 1.974812364839079, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.85563659667969, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8571264743804932, "num_tokens": 592142116.0, "step": 15524 }, { "epoch": 1.9749395751176695, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.85944747924805, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8621851205825806, "num_tokens": 592182507.0, "step": 15525 }, { "epoch": 1.97506678539626, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.30488586425781, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8392143249511719, "num_tokens": 592225593.0, "step": 15526 }, { "epoch": 1.9751939956748505, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.35249710083008, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.885856568813324, "num_tokens": 592270676.0, "step": 15527 }, { "epoch": 1.975321205953441, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.14686584472656, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8804906606674194, "num_tokens": 592305537.0, "step": 15528 }, { "epoch": 1.9754484162320316, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.59440994262695, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8667116165161133, "num_tokens": 592338289.0, "step": 15529 }, { "epoch": 1.9755756265106221, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.657249450683594, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8619853854179382, "num_tokens": 592379189.0, "step": 15530 }, { "epoch": 1.9757028367892127, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.838905334472656, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8747979998588562, "num_tokens": 592421875.0, "step": 15531 }, { "epoch": 1.9758300470678032, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.51028823852539, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8422775268554688, "num_tokens": 592457438.0, "step": 15532 }, { "epoch": 1.9759572573463937, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.45770263671875, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8717230558395386, "num_tokens": 592493230.0, "step": 15533 }, { "epoch": 1.9760844676249842, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.558982849121094, "learning_rate": 1e-06, "loss": 0.6305, "mean_token_accuracy": 0.8456854820251465, "num_tokens": 592533656.0, "step": 15534 }, { "epoch": 1.9762116779035748, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.902286529541016, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8748186826705933, "num_tokens": 592571622.0, "step": 15535 }, { "epoch": 1.976338888182165, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.24470520019531, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8558155298233032, "num_tokens": 592613260.0, "step": 15536 }, { "epoch": 1.9764660984607556, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.995323181152344, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8585772514343262, "num_tokens": 592648872.0, "step": 15537 }, { "epoch": 1.9765933087393461, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.15647888183594, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8621723651885986, "num_tokens": 592682336.0, "step": 15538 }, { "epoch": 1.9767205190179367, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.717079162597656, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8755484223365784, "num_tokens": 592716412.0, "step": 15539 }, { "epoch": 1.9768477292965272, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.39360427856445, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8632140755653381, "num_tokens": 592753936.0, "step": 15540 }, { "epoch": 1.9769749395751177, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.3984489440918, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8611500263214111, "num_tokens": 592792731.0, "step": 15541 }, { "epoch": 1.977102149853708, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.00407409667969, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8641518950462341, "num_tokens": 592832035.0, "step": 15542 }, { "epoch": 1.9772293601322986, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 44.987754821777344, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8786516189575195, "num_tokens": 592869693.0, "step": 15543 }, { "epoch": 1.977356570410889, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.62444305419922, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8646829724311829, "num_tokens": 592909648.0, "step": 15544 }, { "epoch": 1.9774837806894796, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.37549591064453, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8644323945045471, "num_tokens": 592942539.0, "step": 15545 }, { "epoch": 1.9776109909680701, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.65887451171875, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8603459596633911, "num_tokens": 592983680.0, "step": 15546 }, { "epoch": 1.9777382012466607, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.4139289855957, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8516913652420044, "num_tokens": 593023161.0, "step": 15547 }, { "epoch": 1.9778654115252512, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.44412612915039, "learning_rate": 1e-06, "loss": 0.632, "mean_token_accuracy": 0.8432444334030151, "num_tokens": 593066612.0, "step": 15548 }, { "epoch": 1.9779926218038417, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.30973815917969, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8488671779632568, "num_tokens": 593105224.0, "step": 15549 }, { "epoch": 1.9781198320824323, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.4543342590332, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.860453188419342, "num_tokens": 593143683.0, "step": 15550 }, { "epoch": 1.9782470423610228, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.81549835205078, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8465263247489929, "num_tokens": 593179827.0, "step": 15551 }, { "epoch": 1.9783742526396133, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.29710006713867, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8628889322280884, "num_tokens": 593220936.0, "step": 15552 }, { "epoch": 1.9785014629182038, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.74775695800781, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8567377924919128, "num_tokens": 593258503.0, "step": 15553 }, { "epoch": 1.9786286731967944, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.63820266723633, "learning_rate": 1e-06, "loss": 0.6604, "mean_token_accuracy": 0.8424810767173767, "num_tokens": 593293738.0, "step": 15554 }, { "epoch": 1.978755883475385, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.76210403442383, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8523753881454468, "num_tokens": 593338037.0, "step": 15555 }, { "epoch": 1.9788830937539754, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.216453552246094, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8455959558486938, "num_tokens": 593383509.0, "step": 15556 }, { "epoch": 1.979010304032566, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.9195556640625, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8646152019500732, "num_tokens": 593417734.0, "step": 15557 }, { "epoch": 1.9791375143111565, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.12067413330078, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8736895322799683, "num_tokens": 593454502.0, "step": 15558 }, { "epoch": 1.979264724589747, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.776729583740234, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8573716878890991, "num_tokens": 593498864.0, "step": 15559 }, { "epoch": 1.9793919348683373, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.18152618408203, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8578445911407471, "num_tokens": 593540125.0, "step": 15560 }, { "epoch": 1.9795191451469278, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.32065200805664, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8689651489257812, "num_tokens": 593566937.0, "step": 15561 }, { "epoch": 1.9796463554255184, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.83457946777344, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8553221821784973, "num_tokens": 593600109.0, "step": 15562 }, { "epoch": 1.979773565704109, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 57.93395233154297, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8611161112785339, "num_tokens": 593638982.0, "step": 15563 }, { "epoch": 1.9799007759826994, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.603736877441406, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8515293002128601, "num_tokens": 593677270.0, "step": 15564 }, { "epoch": 1.98002798626129, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.40055847167969, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8698764443397522, "num_tokens": 593714056.0, "step": 15565 }, { "epoch": 1.9801551965398803, "ewc_loss": 0.16015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001373291015625, "grad_norm": 44.35740280151367, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8487581014633179, "num_tokens": 593753516.0, "step": 15566 }, { "epoch": 1.9802824068184708, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.94776916503906, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8618174195289612, "num_tokens": 593789276.0, "step": 15567 }, { "epoch": 1.9804096170970613, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 45.3900260925293, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8644381761550903, "num_tokens": 593828112.0, "step": 15568 }, { "epoch": 1.9805368273756518, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.945011138916016, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8489860892295837, "num_tokens": 593866370.0, "step": 15569 }, { "epoch": 1.9806640376542424, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.69816970825195, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8661741018295288, "num_tokens": 593907863.0, "step": 15570 }, { "epoch": 1.980791247932833, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.46888732910156, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.872481107711792, "num_tokens": 593945254.0, "step": 15571 }, { "epoch": 1.9809184582114234, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.736968994140625, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8663250207901001, "num_tokens": 593987105.0, "step": 15572 }, { "epoch": 1.981045668490014, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.517887115478516, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8722612857818604, "num_tokens": 594024091.0, "step": 15573 }, { "epoch": 1.9811728787686045, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.476016998291016, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8718626499176025, "num_tokens": 594056018.0, "step": 15574 }, { "epoch": 1.981300089047195, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.97060012817383, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8655683994293213, "num_tokens": 594102297.0, "step": 15575 }, { "epoch": 1.9814272993257855, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 46.32232666015625, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8623446226119995, "num_tokens": 594140278.0, "step": 15576 }, { "epoch": 1.981554509604376, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.18252944946289, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8663275241851807, "num_tokens": 594175025.0, "step": 15577 }, { "epoch": 1.9816817198829666, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.04624557495117, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8605009317398071, "num_tokens": 594217906.0, "step": 15578 }, { "epoch": 1.9818089301615571, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.51820373535156, "learning_rate": 1e-06, "loss": 0.6377, "mean_token_accuracy": 0.8437449932098389, "num_tokens": 594264331.0, "step": 15579 }, { "epoch": 1.9819361404401477, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.59945297241211, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8782495260238647, "num_tokens": 594300220.0, "step": 15580 }, { "epoch": 1.9820633507187382, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.97638702392578, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8672570586204529, "num_tokens": 594337849.0, "step": 15581 }, { "epoch": 1.9821905609973287, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.727115631103516, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8550513386726379, "num_tokens": 594379295.0, "step": 15582 }, { "epoch": 1.9823177712759192, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.837066650390625, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8579590320587158, "num_tokens": 594408350.0, "step": 15583 }, { "epoch": 1.9824449815545098, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.33546447753906, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8515056371688843, "num_tokens": 594455110.0, "step": 15584 }, { "epoch": 1.9825721918331, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.829933166503906, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8656290769577026, "num_tokens": 594499231.0, "step": 15585 }, { "epoch": 1.9826994021116906, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.95454788208008, "learning_rate": 1e-06, "loss": 0.6439, "mean_token_accuracy": 0.8433923125267029, "num_tokens": 594541478.0, "step": 15586 }, { "epoch": 1.9828266123902811, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.742027282714844, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8651447296142578, "num_tokens": 594579075.0, "step": 15587 }, { "epoch": 1.9829538226688717, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.290687561035156, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.878031849861145, "num_tokens": 594616189.0, "step": 15588 }, { "epoch": 1.9830810329474622, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.548526763916016, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8682421445846558, "num_tokens": 594656633.0, "step": 15589 }, { "epoch": 1.9832082432260527, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.17830276489258, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.842586100101471, "num_tokens": 594696833.0, "step": 15590 }, { "epoch": 1.983335453504643, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.19369125366211, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8493914604187012, "num_tokens": 594735138.0, "step": 15591 }, { "epoch": 1.9834626637832335, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.38221740722656, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8557237386703491, "num_tokens": 594769587.0, "step": 15592 }, { "epoch": 1.983589874061824, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.34967041015625, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8665046691894531, "num_tokens": 594803635.0, "step": 15593 }, { "epoch": 1.9837170843404146, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.49698257446289, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8771412372589111, "num_tokens": 594839872.0, "step": 15594 }, { "epoch": 1.9838442946190051, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.560630798339844, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8491638898849487, "num_tokens": 594879208.0, "step": 15595 }, { "epoch": 1.9839715048975957, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.040245056152344, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8678310513496399, "num_tokens": 594910377.0, "step": 15596 }, { "epoch": 1.9840987151761862, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.72329330444336, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8640619516372681, "num_tokens": 594956574.0, "step": 15597 }, { "epoch": 1.9842259254547767, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.48683547973633, "learning_rate": 1e-06, "loss": 0.6219, "mean_token_accuracy": 0.8473997712135315, "num_tokens": 594998036.0, "step": 15598 }, { "epoch": 1.9843531357333672, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.19445037841797, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8551176190376282, "num_tokens": 595031404.0, "step": 15599 }, { "epoch": 1.9844803460119578, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.7280158996582, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.869775116443634, "num_tokens": 595069800.0, "step": 15600 }, { "epoch": 1.9846075562905483, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.19740295410156, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8672285079956055, "num_tokens": 595110987.0, "step": 15601 }, { "epoch": 1.9847347665691388, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.98566436767578, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8656034469604492, "num_tokens": 595149192.0, "step": 15602 }, { "epoch": 1.9848619768477294, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.58474349975586, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8582012057304382, "num_tokens": 595182993.0, "step": 15603 }, { "epoch": 1.9849891871263199, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.50189971923828, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8593820333480835, "num_tokens": 595216177.0, "step": 15604 }, { "epoch": 1.9851163974049104, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.497859954833984, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8618906140327454, "num_tokens": 595253379.0, "step": 15605 }, { "epoch": 1.985243607683501, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.5688591003418, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8657152652740479, "num_tokens": 595292878.0, "step": 15606 }, { "epoch": 1.9853708179620915, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.91223907470703, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8634225726127625, "num_tokens": 595329082.0, "step": 15607 }, { "epoch": 1.985498028240682, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.31879806518555, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8540524244308472, "num_tokens": 595367246.0, "step": 15608 }, { "epoch": 1.9856252385192723, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.813045501708984, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8684390783309937, "num_tokens": 595404450.0, "step": 15609 }, { "epoch": 1.9857524487978628, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.28272247314453, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.874163031578064, "num_tokens": 595439718.0, "step": 15610 }, { "epoch": 1.9858796590764534, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.8094367980957, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8635217547416687, "num_tokens": 595473617.0, "step": 15611 }, { "epoch": 1.986006869355044, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.27094650268555, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.858437180519104, "num_tokens": 595516432.0, "step": 15612 }, { "epoch": 1.9861340796336344, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.65987777709961, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8638098835945129, "num_tokens": 595556278.0, "step": 15613 }, { "epoch": 1.986261289912225, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.57737350463867, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8671388626098633, "num_tokens": 595598713.0, "step": 15614 }, { "epoch": 1.9863885001908153, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.470069885253906, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8647589087486267, "num_tokens": 595630597.0, "step": 15615 }, { "epoch": 1.9865157104694058, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.34199905395508, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8796145915985107, "num_tokens": 595664163.0, "step": 15616 }, { "epoch": 1.9866429207479963, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.85809326171875, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8707160949707031, "num_tokens": 595698809.0, "step": 15617 }, { "epoch": 1.9867701310265868, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.18589401245117, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8555406332015991, "num_tokens": 595741166.0, "step": 15618 }, { "epoch": 1.9868973413051774, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.55628204345703, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.854953944683075, "num_tokens": 595787079.0, "step": 15619 }, { "epoch": 1.987024551583768, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.29278564453125, "learning_rate": 1e-06, "loss": 0.6461, "mean_token_accuracy": 0.8378057479858398, "num_tokens": 595829085.0, "step": 15620 }, { "epoch": 1.9871517618623584, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.873619079589844, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8589891195297241, "num_tokens": 595870646.0, "step": 15621 }, { "epoch": 1.987278972140949, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.303688049316406, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8515363931655884, "num_tokens": 595918473.0, "step": 15622 }, { "epoch": 1.9874061824195395, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.01205062866211, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8506177663803101, "num_tokens": 595954097.0, "step": 15623 }, { "epoch": 1.98753339269813, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.2880744934082, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8429213762283325, "num_tokens": 595990191.0, "step": 15624 }, { "epoch": 1.9876606029767205, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.96390151977539, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8543457388877869, "num_tokens": 596027109.0, "step": 15625 }, { "epoch": 1.987787813255311, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.54695510864258, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8696743249893188, "num_tokens": 596067930.0, "step": 15626 }, { "epoch": 1.9879150235339016, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.82456970214844, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8727422952651978, "num_tokens": 596105809.0, "step": 15627 }, { "epoch": 1.9880422338124921, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.193260192871094, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8667285442352295, "num_tokens": 596144220.0, "step": 15628 }, { "epoch": 1.9881694440910826, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.7725715637207, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.8382028341293335, "num_tokens": 596183825.0, "step": 15629 }, { "epoch": 1.9882966543696732, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.36349868774414, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8700050115585327, "num_tokens": 596232333.0, "step": 15630 }, { "epoch": 1.9884238646482637, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.55937194824219, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8717740178108215, "num_tokens": 596272138.0, "step": 15631 }, { "epoch": 1.9885510749268542, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.860172271728516, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.87208092212677, "num_tokens": 596310254.0, "step": 15632 }, { "epoch": 1.9886782852054448, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.52766036987305, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.858316957950592, "num_tokens": 596350396.0, "step": 15633 }, { "epoch": 1.988805495484035, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.320926666259766, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8661795854568481, "num_tokens": 596388809.0, "step": 15634 }, { "epoch": 1.9889327057626256, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.80963134765625, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8693876266479492, "num_tokens": 596427366.0, "step": 15635 }, { "epoch": 1.9890599160412161, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.446006774902344, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8528701663017273, "num_tokens": 596469999.0, "step": 15636 }, { "epoch": 1.9891871263198067, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.06674575805664, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8462234735488892, "num_tokens": 596506773.0, "step": 15637 }, { "epoch": 1.9893143365983972, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.12404251098633, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8643237352371216, "num_tokens": 596547440.0, "step": 15638 }, { "epoch": 1.9894415468769877, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.642120361328125, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8671988248825073, "num_tokens": 596580212.0, "step": 15639 }, { "epoch": 1.989568757155578, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.46704864501953, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.883097767829895, "num_tokens": 596611790.0, "step": 15640 }, { "epoch": 1.9896959674341685, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.639705657958984, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8776942491531372, "num_tokens": 596646984.0, "step": 15641 }, { "epoch": 1.989823177712759, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.34849166870117, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8594591021537781, "num_tokens": 596688362.0, "step": 15642 }, { "epoch": 1.9899503879913496, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.98465347290039, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8645797967910767, "num_tokens": 596725298.0, "step": 15643 }, { "epoch": 1.9900775982699401, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.44835662841797, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8665949106216431, "num_tokens": 596757400.0, "step": 15644 }, { "epoch": 1.9902048085485307, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.55694580078125, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8745269775390625, "num_tokens": 596790533.0, "step": 15645 }, { "epoch": 1.9903320188271212, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.755985260009766, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8564110994338989, "num_tokens": 596834467.0, "step": 15646 }, { "epoch": 1.9904592291057117, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.85824966430664, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8506830930709839, "num_tokens": 596873262.0, "step": 15647 }, { "epoch": 1.9905864393843022, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.67859649658203, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8634822964668274, "num_tokens": 596912833.0, "step": 15648 }, { "epoch": 1.9907136496628928, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.69329833984375, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8698192834854126, "num_tokens": 596953554.0, "step": 15649 }, { "epoch": 1.9908408599414833, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.769752502441406, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8565664291381836, "num_tokens": 596991206.0, "step": 15650 }, { "epoch": 1.9909680702200738, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.6960563659668, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8640933036804199, "num_tokens": 597027222.0, "step": 15651 }, { "epoch": 1.9910952804986644, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.03826141357422, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8723021745681763, "num_tokens": 597069040.0, "step": 15652 }, { "epoch": 1.9912224907772549, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 46.01165771484375, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8556889295578003, "num_tokens": 597112529.0, "step": 15653 }, { "epoch": 1.9913497010558454, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.84970474243164, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8622344732284546, "num_tokens": 597148304.0, "step": 15654 }, { "epoch": 1.991476911334436, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.86026382446289, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8703605532646179, "num_tokens": 597184244.0, "step": 15655 }, { "epoch": 1.9916041216130265, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 46.04344940185547, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8552334904670715, "num_tokens": 597218178.0, "step": 15656 }, { "epoch": 1.991731331891617, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.09341049194336, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.859832763671875, "num_tokens": 597259256.0, "step": 15657 }, { "epoch": 1.9918585421702073, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.30537414550781, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8728417158126831, "num_tokens": 597296584.0, "step": 15658 }, { "epoch": 1.9919857524487978, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 44.65053939819336, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8457876443862915, "num_tokens": 597341116.0, "step": 15659 }, { "epoch": 1.9921129627273884, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.493709564208984, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8594027757644653, "num_tokens": 597375595.0, "step": 15660 }, { "epoch": 1.9922401730059789, "ewc_loss": 0.1640625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000141143798828125, "grad_norm": 45.259857177734375, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8474966287612915, "num_tokens": 597419226.0, "step": 15661 }, { "epoch": 1.9923673832845694, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.60667037963867, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8428021669387817, "num_tokens": 597461242.0, "step": 15662 }, { "epoch": 1.99249459356316, "ewc_loss": 0.1630859375, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014019012451171875, "grad_norm": 44.74644470214844, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8613129258155823, "num_tokens": 597500030.0, "step": 15663 }, { "epoch": 1.9926218038417502, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.4257926940918, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.845784068107605, "num_tokens": 597540428.0, "step": 15664 }, { "epoch": 1.9927490141203408, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.27976989746094, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.872779369354248, "num_tokens": 597575070.0, "step": 15665 }, { "epoch": 1.9928762243989313, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.21852111816406, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.859926700592041, "num_tokens": 597617425.0, "step": 15666 }, { "epoch": 1.9930034346775218, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.68918228149414, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.868094801902771, "num_tokens": 597660489.0, "step": 15667 }, { "epoch": 1.9931306449561124, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.83003616333008, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.866279125213623, "num_tokens": 597703626.0, "step": 15668 }, { "epoch": 1.993257855234703, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.85057067871094, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8575006127357483, "num_tokens": 597741176.0, "step": 15669 }, { "epoch": 1.9933850655132934, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.672325134277344, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8477007150650024, "num_tokens": 597779763.0, "step": 15670 }, { "epoch": 1.993512275791884, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.6495361328125, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8660757541656494, "num_tokens": 597813300.0, "step": 15671 }, { "epoch": 1.9936394860704745, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.81550979614258, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8507093191146851, "num_tokens": 597849062.0, "step": 15672 }, { "epoch": 1.993766696349065, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.454166412353516, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8643084764480591, "num_tokens": 597886565.0, "step": 15673 }, { "epoch": 1.9938939066276555, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.51157760620117, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.864404559135437, "num_tokens": 597925097.0, "step": 15674 }, { "epoch": 1.994021116906246, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.17094802856445, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8743534088134766, "num_tokens": 597960786.0, "step": 15675 }, { "epoch": 1.9941483271848366, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.499515533447266, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8446700572967529, "num_tokens": 598000442.0, "step": 15676 }, { "epoch": 1.9942755374634271, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.61014938354492, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8594844341278076, "num_tokens": 598042946.0, "step": 15677 }, { "epoch": 1.9944027477420176, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.820194244384766, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8404241800308228, "num_tokens": 598081659.0, "step": 15678 }, { "epoch": 1.9945299580206082, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.283939361572266, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8554632663726807, "num_tokens": 598120203.0, "step": 15679 }, { "epoch": 1.9946571682991987, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.7338981628418, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8472661972045898, "num_tokens": 598154779.0, "step": 15680 }, { "epoch": 1.9947843785777892, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.24392318725586, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8614537715911865, "num_tokens": 598197075.0, "step": 15681 }, { "epoch": 1.9949115888563798, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.7385368347168, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8531707525253296, "num_tokens": 598240539.0, "step": 15682 }, { "epoch": 1.99503879913497, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.88898849487305, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8619019389152527, "num_tokens": 598281776.0, "step": 15683 }, { "epoch": 1.9951660094135606, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.319068908691406, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8683065176010132, "num_tokens": 598322715.0, "step": 15684 }, { "epoch": 1.9952932196921511, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.8717041015625, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8552378416061401, "num_tokens": 598364495.0, "step": 15685 }, { "epoch": 1.9954204299707416, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.358890533447266, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8585221767425537, "num_tokens": 598407102.0, "step": 15686 }, { "epoch": 1.9955476402493322, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.41507339477539, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8470712900161743, "num_tokens": 598448600.0, "step": 15687 }, { "epoch": 1.9956748505279227, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.69594192504883, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8516591191291809, "num_tokens": 598491140.0, "step": 15688 }, { "epoch": 1.995802060806513, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.42827606201172, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8563832640647888, "num_tokens": 598527831.0, "step": 15689 }, { "epoch": 1.9959292710851035, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.75008773803711, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8488443493843079, "num_tokens": 598568680.0, "step": 15690 }, { "epoch": 1.996056481363694, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.29835510253906, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8486044406890869, "num_tokens": 598606254.0, "step": 15691 }, { "epoch": 1.9961836916422846, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.65329360961914, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8639752268791199, "num_tokens": 598642400.0, "step": 15692 }, { "epoch": 1.9963109019208751, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.44779586791992, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.8386003375053406, "num_tokens": 598678379.0, "step": 15693 }, { "epoch": 1.9964381121994657, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.6483154296875, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8597135543823242, "num_tokens": 598718234.0, "step": 15694 }, { "epoch": 1.9965653224780562, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.576229095458984, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8646427392959595, "num_tokens": 598754897.0, "step": 15695 }, { "epoch": 1.9966925327566467, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.423343658447266, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8553872108459473, "num_tokens": 598785268.0, "step": 15696 }, { "epoch": 1.9968197430352372, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.753421783447266, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8675224781036377, "num_tokens": 598827131.0, "step": 15697 }, { "epoch": 1.9969469533138278, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.62506103515625, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8620321750640869, "num_tokens": 598859507.0, "step": 15698 }, { "epoch": 1.9970741635924183, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.59202194213867, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8511824607849121, "num_tokens": 598893280.0, "step": 15699 }, { "epoch": 1.9972013738710088, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.579803466796875, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8745384812355042, "num_tokens": 598932759.0, "step": 15700 }, { "epoch": 1.9973285841495994, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.315284729003906, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8708041906356812, "num_tokens": 598972899.0, "step": 15701 }, { "epoch": 1.9974557944281899, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.77489471435547, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8633452653884888, "num_tokens": 599012811.0, "step": 15702 }, { "epoch": 1.9975830047067804, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.57635498046875, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8518210649490356, "num_tokens": 599055632.0, "step": 15703 }, { "epoch": 1.997710214985371, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.745155334472656, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8436356782913208, "num_tokens": 599090924.0, "step": 15704 }, { "epoch": 1.9978374252639615, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.802913665771484, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8748907446861267, "num_tokens": 599130434.0, "step": 15705 }, { "epoch": 1.997964635542552, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.70347213745117, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.860429584980011, "num_tokens": 599171018.0, "step": 15706 }, { "epoch": 1.9980918458211423, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.882991790771484, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.853229820728302, "num_tokens": 599212445.0, "step": 15707 }, { "epoch": 1.9982190560997328, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.575862884521484, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.863953709602356, "num_tokens": 599250591.0, "step": 15708 }, { "epoch": 1.9983462663783234, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.75413513183594, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8563628196716309, "num_tokens": 599284987.0, "step": 15709 }, { "epoch": 1.9984734766569139, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.911598205566406, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8560667037963867, "num_tokens": 599324656.0, "step": 15710 }, { "epoch": 1.9986006869355044, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 44.89763641357422, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8716761469841003, "num_tokens": 599364014.0, "step": 15711 }, { "epoch": 1.998727897214095, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.66954040527344, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8538925647735596, "num_tokens": 599396781.0, "step": 15712 }, { "epoch": 1.9988551074926852, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.36251449584961, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8711718916893005, "num_tokens": 599433240.0, "step": 15713 }, { "epoch": 1.9989823177712758, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.294857025146484, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8693861961364746, "num_tokens": 599474883.0, "step": 15714 }, { "epoch": 1.9991095280498663, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.686763763427734, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8630942106246948, "num_tokens": 599508492.0, "step": 15715 }, { "epoch": 1.9992367383284568, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.039798736572266, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8637495040893555, "num_tokens": 599545946.0, "step": 15716 }, { "epoch": 1.9993639486070474, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.82392883300781, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.874631404876709, "num_tokens": 599586217.0, "step": 15717 }, { "epoch": 1.9994911588856379, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.434444427490234, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8707536458969116, "num_tokens": 599620431.0, "step": 15718 }, { "epoch": 1.9996183691642284, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.44304656982422, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8458288908004761, "num_tokens": 599659959.0, "step": 15719 }, { "epoch": 1.999745579442819, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.00143051147461, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8588684797286987, "num_tokens": 599696558.0, "step": 15720 }, { "epoch": 1.9998727897214095, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.06281661987305, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8493431210517883, "num_tokens": 599734925.0, "step": 15721 }, { "epoch": 2.0, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.41359329223633, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8640867471694946, "num_tokens": 599772613.0, "step": 15722 }, { "epoch": 2.0001272102785905, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.89592742919922, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8641777038574219, "num_tokens": 599813892.0, "step": 15723 }, { "epoch": 2.000254420557181, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.56310272216797, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8616024851799011, "num_tokens": 599852807.0, "step": 15724 }, { "epoch": 2.0003816308357716, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.731605529785156, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8677763938903809, "num_tokens": 599893712.0, "step": 15725 }, { "epoch": 2.000508841114362, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.72346496582031, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8629530668258667, "num_tokens": 599929954.0, "step": 15726 }, { "epoch": 2.0006360513929526, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.72513198852539, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8602186441421509, "num_tokens": 599967763.0, "step": 15727 }, { "epoch": 2.000763261671543, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.45512771606445, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8707654476165771, "num_tokens": 600008938.0, "step": 15728 }, { "epoch": 2.0008904719501337, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.10572052001953, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8580398559570312, "num_tokens": 600051849.0, "step": 15729 }, { "epoch": 2.0010176822287242, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.39479064941406, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8686709403991699, "num_tokens": 600088568.0, "step": 15730 }, { "epoch": 2.0011448925073148, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.549407958984375, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8656628131866455, "num_tokens": 600131553.0, "step": 15731 }, { "epoch": 2.0012721027859053, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.46106719970703, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8623626828193665, "num_tokens": 600173198.0, "step": 15732 }, { "epoch": 2.001399313064496, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.56155776977539, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8659267425537109, "num_tokens": 600211347.0, "step": 15733 }, { "epoch": 2.0015265233430863, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.34825134277344, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8560240268707275, "num_tokens": 600249953.0, "step": 15734 }, { "epoch": 2.0016537336216764, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.68247604370117, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8536392450332642, "num_tokens": 600291211.0, "step": 15735 }, { "epoch": 2.001780943900267, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 46.09168243408203, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8656217455863953, "num_tokens": 600332906.0, "step": 15736 }, { "epoch": 2.0019081541788575, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.25013732910156, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8625117540359497, "num_tokens": 600370570.0, "step": 15737 }, { "epoch": 2.002035364457448, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.78810501098633, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8531774282455444, "num_tokens": 600409368.0, "step": 15738 }, { "epoch": 2.0021625747360385, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.543338775634766, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8664361238479614, "num_tokens": 600442019.0, "step": 15739 }, { "epoch": 2.002289785014629, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.00510025024414, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8700857758522034, "num_tokens": 600471665.0, "step": 15740 }, { "epoch": 2.0024169952932196, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.604557037353516, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8599752187728882, "num_tokens": 600508804.0, "step": 15741 }, { "epoch": 2.00254420557181, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.49601364135742, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8495655059814453, "num_tokens": 600543895.0, "step": 15742 }, { "epoch": 2.0026714158504006, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.21989822387695, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8641201257705688, "num_tokens": 600585818.0, "step": 15743 }, { "epoch": 2.002798626128991, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.25214767456055, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8637610673904419, "num_tokens": 600623106.0, "step": 15744 }, { "epoch": 2.0029258364075817, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.677825927734375, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8848689794540405, "num_tokens": 600662662.0, "step": 15745 }, { "epoch": 2.0030530466861722, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.542354583740234, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8483908176422119, "num_tokens": 600699416.0, "step": 15746 }, { "epoch": 2.0031802569647628, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.981319427490234, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8554388284683228, "num_tokens": 600739351.0, "step": 15747 }, { "epoch": 2.0033074672433533, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.52375793457031, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8639860153198242, "num_tokens": 600781198.0, "step": 15748 }, { "epoch": 2.003434677521944, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.86142349243164, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8591710329055786, "num_tokens": 600821743.0, "step": 15749 }, { "epoch": 2.0035618878005343, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.23445510864258, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8762630224227905, "num_tokens": 600862463.0, "step": 15750 }, { "epoch": 2.003689098079125, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.181663513183594, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8507648706436157, "num_tokens": 600902629.0, "step": 15751 }, { "epoch": 2.0038163083577154, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.60763931274414, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8587696552276611, "num_tokens": 600941778.0, "step": 15752 }, { "epoch": 2.003943518636306, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.95051574707031, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.8693399429321289, "num_tokens": 600979425.0, "step": 15753 }, { "epoch": 2.0040707289148965, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.07171630859375, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8579132556915283, "num_tokens": 601017882.0, "step": 15754 }, { "epoch": 2.004197939193487, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.48088836669922, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8566997647285461, "num_tokens": 601053394.0, "step": 15755 }, { "epoch": 2.0043251494720775, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.358985900878906, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8649386763572693, "num_tokens": 601094940.0, "step": 15756 }, { "epoch": 2.004452359750668, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.00946807861328, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.862812876701355, "num_tokens": 601132963.0, "step": 15757 }, { "epoch": 2.0045795700292586, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.65896987915039, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.859917163848877, "num_tokens": 601168511.0, "step": 15758 }, { "epoch": 2.0047067803078487, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.06819534301758, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8615692853927612, "num_tokens": 601208922.0, "step": 15759 }, { "epoch": 2.004833990586439, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.05155563354492, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8559731245040894, "num_tokens": 601253814.0, "step": 15760 }, { "epoch": 2.0049612008650297, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.76864242553711, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8644310235977173, "num_tokens": 601295584.0, "step": 15761 }, { "epoch": 2.0050884111436202, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.109474182128906, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8544227480888367, "num_tokens": 601329134.0, "step": 15762 }, { "epoch": 2.0052156214222108, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.235233306884766, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8714296817779541, "num_tokens": 601369100.0, "step": 15763 }, { "epoch": 2.0053428317008013, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.171775817871094, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8767932653427124, "num_tokens": 601407876.0, "step": 15764 }, { "epoch": 2.005470041979392, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.67070388793945, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8761054277420044, "num_tokens": 601446996.0, "step": 15765 }, { "epoch": 2.0055972522579824, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.25166702270508, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8524225950241089, "num_tokens": 601482158.0, "step": 15766 }, { "epoch": 2.005724462536573, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.37824249267578, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.869672417640686, "num_tokens": 601517940.0, "step": 15767 }, { "epoch": 2.0058516728151634, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.27412033081055, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8822875022888184, "num_tokens": 601553823.0, "step": 15768 }, { "epoch": 2.005978883093754, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.234371185302734, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8503373265266418, "num_tokens": 601595482.0, "step": 15769 }, { "epoch": 2.0061060933723445, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.407901763916016, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.869845986366272, "num_tokens": 601632451.0, "step": 15770 }, { "epoch": 2.006233303650935, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.626277923583984, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8599221706390381, "num_tokens": 601667573.0, "step": 15771 }, { "epoch": 2.0063605139295255, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.494869232177734, "learning_rate": 1e-06, "loss": 0.5535, "mean_token_accuracy": 0.869505763053894, "num_tokens": 601704494.0, "step": 15772 }, { "epoch": 2.006487724208116, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.11430358886719, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8521528840065002, "num_tokens": 601745307.0, "step": 15773 }, { "epoch": 2.0066149344867066, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.777462005615234, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8777163624763489, "num_tokens": 601782551.0, "step": 15774 }, { "epoch": 2.006742144765297, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 44.97724151611328, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8642314672470093, "num_tokens": 601823378.0, "step": 15775 }, { "epoch": 2.0068693550438876, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.610557556152344, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8619341850280762, "num_tokens": 601860881.0, "step": 15776 }, { "epoch": 2.006996565322478, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.042335510253906, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.861106276512146, "num_tokens": 601903502.0, "step": 15777 }, { "epoch": 2.0071237756010687, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.68455123901367, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8800610303878784, "num_tokens": 601937097.0, "step": 15778 }, { "epoch": 2.007250985879659, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.337440490722656, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8661158084869385, "num_tokens": 601978050.0, "step": 15779 }, { "epoch": 2.0073781961582498, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.2844123840332, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.861630916595459, "num_tokens": 602018167.0, "step": 15780 }, { "epoch": 2.0075054064368403, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.546485900878906, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.848076581954956, "num_tokens": 602054485.0, "step": 15781 }, { "epoch": 2.007632616715431, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.084293365478516, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8669090270996094, "num_tokens": 602092881.0, "step": 15782 }, { "epoch": 2.0077598269940213, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.639347076416016, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8638218641281128, "num_tokens": 602127424.0, "step": 15783 }, { "epoch": 2.0078870372726114, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.36654281616211, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8696269989013672, "num_tokens": 602168340.0, "step": 15784 }, { "epoch": 2.008014247551202, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.55983352661133, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8529881238937378, "num_tokens": 602207894.0, "step": 15785 }, { "epoch": 2.0081414578297925, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.149898529052734, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.864538848400116, "num_tokens": 602245924.0, "step": 15786 }, { "epoch": 2.008268668108383, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.52927017211914, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.861972987651825, "num_tokens": 602278417.0, "step": 15787 }, { "epoch": 2.0083958783869735, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.60398483276367, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8654369711875916, "num_tokens": 602318431.0, "step": 15788 }, { "epoch": 2.008523088665564, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.10670852661133, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8537753820419312, "num_tokens": 602356315.0, "step": 15789 }, { "epoch": 2.0086502989441546, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.37714385986328, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8766195774078369, "num_tokens": 602398865.0, "step": 15790 }, { "epoch": 2.008777509222745, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.62975311279297, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8721300959587097, "num_tokens": 602435835.0, "step": 15791 }, { "epoch": 2.0089047195013356, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.26087188720703, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8703019618988037, "num_tokens": 602470359.0, "step": 15792 }, { "epoch": 2.009031929779926, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.62105941772461, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8665356040000916, "num_tokens": 602512132.0, "step": 15793 }, { "epoch": 2.0091591400585167, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.053489685058594, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8596744537353516, "num_tokens": 602552700.0, "step": 15794 }, { "epoch": 2.0092863503371072, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.5975341796875, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8621978163719177, "num_tokens": 602587779.0, "step": 15795 }, { "epoch": 2.0094135606156978, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.79039001464844, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8525434732437134, "num_tokens": 602623834.0, "step": 15796 }, { "epoch": 2.0095407708942883, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.964630126953125, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8751476407051086, "num_tokens": 602660141.0, "step": 15797 }, { "epoch": 2.009667981172879, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.74706268310547, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.867652416229248, "num_tokens": 602698990.0, "step": 15798 }, { "epoch": 2.0097951914514693, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.48612976074219, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8433471918106079, "num_tokens": 602732053.0, "step": 15799 }, { "epoch": 2.00992240173006, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.592647552490234, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8638597130775452, "num_tokens": 602770975.0, "step": 15800 }, { "epoch": 2.0100496120086504, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.187225341796875, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8541277050971985, "num_tokens": 602813207.0, "step": 15801 }, { "epoch": 2.010176822287241, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.76803207397461, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8679182529449463, "num_tokens": 602849951.0, "step": 15802 }, { "epoch": 2.0103040325658315, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.90367126464844, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8534730672836304, "num_tokens": 602891418.0, "step": 15803 }, { "epoch": 2.010431242844422, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.036434173583984, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8696708083152771, "num_tokens": 602933039.0, "step": 15804 }, { "epoch": 2.0105584531230125, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.7432975769043, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8781057596206665, "num_tokens": 602965546.0, "step": 15805 }, { "epoch": 2.010685663401603, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.94666290283203, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8632248640060425, "num_tokens": 603004473.0, "step": 15806 }, { "epoch": 2.0108128736801936, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.766666412353516, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8551304340362549, "num_tokens": 603041644.0, "step": 15807 }, { "epoch": 2.0109400839587837, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.272987365722656, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8506098389625549, "num_tokens": 603084698.0, "step": 15808 }, { "epoch": 2.011067294237374, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.083831787109375, "learning_rate": 1e-06, "loss": 0.631, "mean_token_accuracy": 0.8456379175186157, "num_tokens": 603121059.0, "step": 15809 }, { "epoch": 2.0111945045159647, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.57688522338867, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8613938093185425, "num_tokens": 603156604.0, "step": 15810 }, { "epoch": 2.0113217147945552, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.926883697509766, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8629071712493896, "num_tokens": 603183743.0, "step": 15811 }, { "epoch": 2.0114489250731458, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.26204299926758, "learning_rate": 1e-06, "loss": 0.6482, "mean_token_accuracy": 0.8410475254058838, "num_tokens": 603222099.0, "step": 15812 }, { "epoch": 2.0115761353517363, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.3038444519043, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8431872129440308, "num_tokens": 603263467.0, "step": 15813 }, { "epoch": 2.011703345630327, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.2481803894043, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8430479764938354, "num_tokens": 603296726.0, "step": 15814 }, { "epoch": 2.0118305559089174, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 44.816802978515625, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8686232566833496, "num_tokens": 603329691.0, "step": 15815 }, { "epoch": 2.011957766187508, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.94096374511719, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8705867528915405, "num_tokens": 603371619.0, "step": 15816 }, { "epoch": 2.0120849764660984, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.229000091552734, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8654630780220032, "num_tokens": 603400018.0, "step": 15817 }, { "epoch": 2.012212186744689, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.54762649536133, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8660633563995361, "num_tokens": 603436971.0, "step": 15818 }, { "epoch": 2.0123393970232795, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.94496154785156, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8627148270606995, "num_tokens": 603472802.0, "step": 15819 }, { "epoch": 2.01246660730187, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.26728439331055, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8670459985733032, "num_tokens": 603506547.0, "step": 15820 }, { "epoch": 2.0125938175804605, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.90572738647461, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8461905121803284, "num_tokens": 603544967.0, "step": 15821 }, { "epoch": 2.012721027859051, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.41754150390625, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8571521043777466, "num_tokens": 603582522.0, "step": 15822 }, { "epoch": 2.0128482381376416, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.61750030517578, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8724662661552429, "num_tokens": 603619584.0, "step": 15823 }, { "epoch": 2.012975448416232, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 44.91946029663086, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8504050374031067, "num_tokens": 603660542.0, "step": 15824 }, { "epoch": 2.0131026586948226, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.000362396240234, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.864679753780365, "num_tokens": 603698900.0, "step": 15825 }, { "epoch": 2.013229868973413, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.12138748168945, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8434940576553345, "num_tokens": 603733019.0, "step": 15826 }, { "epoch": 2.0133570792520037, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.985538482666016, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.86971116065979, "num_tokens": 603772071.0, "step": 15827 }, { "epoch": 2.013484289530594, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.277191162109375, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8549487590789795, "num_tokens": 603811008.0, "step": 15828 }, { "epoch": 2.0136114998091847, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.42667007446289, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8515702486038208, "num_tokens": 603852647.0, "step": 15829 }, { "epoch": 2.0137387100877753, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.25834274291992, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.862022340297699, "num_tokens": 603894198.0, "step": 15830 }, { "epoch": 2.013865920366366, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.21875762939453, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8576894998550415, "num_tokens": 603930168.0, "step": 15831 }, { "epoch": 2.0139931306449563, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.251346588134766, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8719092011451721, "num_tokens": 603966189.0, "step": 15832 }, { "epoch": 2.0141203409235464, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.623233795166016, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8588533401489258, "num_tokens": 604011110.0, "step": 15833 }, { "epoch": 2.014247551202137, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014209747314453125, "grad_norm": 45.63410568237305, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8447223901748657, "num_tokens": 604044256.0, "step": 15834 }, { "epoch": 2.0143747614807275, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.16377258300781, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8703845739364624, "num_tokens": 604078652.0, "step": 15835 }, { "epoch": 2.014501971759318, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.98828887939453, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8710869550704956, "num_tokens": 604112302.0, "step": 15836 }, { "epoch": 2.0146291820379085, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.788997650146484, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8775924444198608, "num_tokens": 604150011.0, "step": 15837 }, { "epoch": 2.014756392316499, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.23815155029297, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8629628419876099, "num_tokens": 604188859.0, "step": 15838 }, { "epoch": 2.0148836025950896, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.37086868286133, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8569555282592773, "num_tokens": 604232273.0, "step": 15839 }, { "epoch": 2.01501081287368, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.092308044433594, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8711056709289551, "num_tokens": 604265236.0, "step": 15840 }, { "epoch": 2.0151380231522706, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.80653762817383, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8769587278366089, "num_tokens": 604304784.0, "step": 15841 }, { "epoch": 2.015265233430861, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.43677520751953, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8881097435951233, "num_tokens": 604341461.0, "step": 15842 }, { "epoch": 2.0153924437094517, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.481300354003906, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8691670894622803, "num_tokens": 604378628.0, "step": 15843 }, { "epoch": 2.0155196539880422, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.12528991699219, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8715256452560425, "num_tokens": 604418070.0, "step": 15844 }, { "epoch": 2.0156468642666328, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.328392028808594, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8739979267120361, "num_tokens": 604454445.0, "step": 15845 }, { "epoch": 2.0157740745452233, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.82545852661133, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8658866882324219, "num_tokens": 604488559.0, "step": 15846 }, { "epoch": 2.015901284823814, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.047393798828125, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8553864359855652, "num_tokens": 604522538.0, "step": 15847 }, { "epoch": 2.0160284951024043, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.858612060546875, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8815902471542358, "num_tokens": 604561750.0, "step": 15848 }, { "epoch": 2.016155705380995, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.99427795410156, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8771969079971313, "num_tokens": 604594961.0, "step": 15849 }, { "epoch": 2.0162829156595854, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.93547821044922, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8533604741096497, "num_tokens": 604631301.0, "step": 15850 }, { "epoch": 2.016410125938176, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.79117202758789, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8420324325561523, "num_tokens": 604666049.0, "step": 15851 }, { "epoch": 2.0165373362167665, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.51890563964844, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8833480477333069, "num_tokens": 604701984.0, "step": 15852 }, { "epoch": 2.016664546495357, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.161376953125, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8745695948600769, "num_tokens": 604738414.0, "step": 15853 }, { "epoch": 2.0167917567739475, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.13652801513672, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8660503625869751, "num_tokens": 604779961.0, "step": 15854 }, { "epoch": 2.016918967052538, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.079227447509766, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8569287061691284, "num_tokens": 604819729.0, "step": 15855 }, { "epoch": 2.0170461773311286, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.58134460449219, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.8737500309944153, "num_tokens": 604864108.0, "step": 15856 }, { "epoch": 2.0171733876097186, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.476871490478516, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8561366200447083, "num_tokens": 604905582.0, "step": 15857 }, { "epoch": 2.017300597888309, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.7727165222168, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8703625202178955, "num_tokens": 604943931.0, "step": 15858 }, { "epoch": 2.0174278081668997, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.29804992675781, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8596943616867065, "num_tokens": 604982100.0, "step": 15859 }, { "epoch": 2.0175550184454902, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.72476577758789, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8531357049942017, "num_tokens": 605026289.0, "step": 15860 }, { "epoch": 2.0176822287240808, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.353416442871094, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8671690225601196, "num_tokens": 605070214.0, "step": 15861 }, { "epoch": 2.0178094390026713, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.27431869506836, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8584683537483215, "num_tokens": 605113495.0, "step": 15862 }, { "epoch": 2.017936649281262, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.16720962524414, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8652987480163574, "num_tokens": 605145717.0, "step": 15863 }, { "epoch": 2.0180638595598523, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.663795471191406, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8560295104980469, "num_tokens": 605180930.0, "step": 15864 }, { "epoch": 2.018191069838443, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.7483024597168, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8703243732452393, "num_tokens": 605214004.0, "step": 15865 }, { "epoch": 2.0183182801170334, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.65991973876953, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8651008009910583, "num_tokens": 605253244.0, "step": 15866 }, { "epoch": 2.018445490395624, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.594966888427734, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8592095375061035, "num_tokens": 605293832.0, "step": 15867 }, { "epoch": 2.0185727006742145, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.31632614135742, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8727333545684814, "num_tokens": 605333811.0, "step": 15868 }, { "epoch": 2.018699910952805, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.02762222290039, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8635550737380981, "num_tokens": 605371560.0, "step": 15869 }, { "epoch": 2.0188271212313955, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.32129669189453, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.8603105545043945, "num_tokens": 605410555.0, "step": 15870 }, { "epoch": 2.018954331509986, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.72869110107422, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8581755757331848, "num_tokens": 605451851.0, "step": 15871 }, { "epoch": 2.0190815417885766, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.31937026977539, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8534640669822693, "num_tokens": 605490442.0, "step": 15872 }, { "epoch": 2.019208752067167, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.00078582763672, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8606151342391968, "num_tokens": 605525237.0, "step": 15873 }, { "epoch": 2.0193359623457576, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.399471282958984, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8680614233016968, "num_tokens": 605563624.0, "step": 15874 }, { "epoch": 2.019463172624348, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.633056640625, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8719212412834167, "num_tokens": 605597419.0, "step": 15875 }, { "epoch": 2.0195903829029387, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.53215408325195, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8535748720169067, "num_tokens": 605639954.0, "step": 15876 }, { "epoch": 2.019717593181529, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.115657806396484, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8634750843048096, "num_tokens": 605678226.0, "step": 15877 }, { "epoch": 2.0198448034601197, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.08107376098633, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8712456822395325, "num_tokens": 605719925.0, "step": 15878 }, { "epoch": 2.0199720137387103, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.34010696411133, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8446781039237976, "num_tokens": 605761282.0, "step": 15879 }, { "epoch": 2.020099224017301, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.4998664855957, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8733835220336914, "num_tokens": 605800681.0, "step": 15880 }, { "epoch": 2.0202264342958913, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.15098571777344, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.8449307680130005, "num_tokens": 605836282.0, "step": 15881 }, { "epoch": 2.0203536445744814, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.84836959838867, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8653882741928101, "num_tokens": 605880360.0, "step": 15882 }, { "epoch": 2.020480854853072, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.216575622558594, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8724590539932251, "num_tokens": 605918363.0, "step": 15883 }, { "epoch": 2.0206080651316625, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.742164611816406, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.876588761806488, "num_tokens": 605953032.0, "step": 15884 }, { "epoch": 2.020735275410253, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.82781982421875, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.870524525642395, "num_tokens": 605992633.0, "step": 15885 }, { "epoch": 2.0208624856888435, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.09532165527344, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8653566241264343, "num_tokens": 606031776.0, "step": 15886 }, { "epoch": 2.020989695967434, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.50039291381836, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8558346033096313, "num_tokens": 606070614.0, "step": 15887 }, { "epoch": 2.0211169062460246, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.540283203125, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.863776445388794, "num_tokens": 606115898.0, "step": 15888 }, { "epoch": 2.021244116524615, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.43117904663086, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8672686815261841, "num_tokens": 606150231.0, "step": 15889 }, { "epoch": 2.0213713268032056, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.937259674072266, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8542380332946777, "num_tokens": 606191527.0, "step": 15890 }, { "epoch": 2.021498537081796, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.48239517211914, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8545931577682495, "num_tokens": 606229070.0, "step": 15891 }, { "epoch": 2.0216257473603867, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.86244201660156, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8634840250015259, "num_tokens": 606269555.0, "step": 15892 }, { "epoch": 2.021752957638977, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 45.468021392822266, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8584418296813965, "num_tokens": 606302040.0, "step": 15893 }, { "epoch": 2.0218801679175677, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.59910202026367, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8726809024810791, "num_tokens": 606341769.0, "step": 15894 }, { "epoch": 2.0220073781961583, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 46.060508728027344, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8497871160507202, "num_tokens": 606379989.0, "step": 15895 }, { "epoch": 2.022134588474749, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.447898864746094, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8566702604293823, "num_tokens": 606414396.0, "step": 15896 }, { "epoch": 2.0222617987533393, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.797279357910156, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8655475378036499, "num_tokens": 606445949.0, "step": 15897 }, { "epoch": 2.02238900903193, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.18671798706055, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8666048645973206, "num_tokens": 606484751.0, "step": 15898 }, { "epoch": 2.0225162193105204, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.093753814697266, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8669106960296631, "num_tokens": 606525390.0, "step": 15899 }, { "epoch": 2.022643429589111, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.424156188964844, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8598349094390869, "num_tokens": 606563364.0, "step": 15900 }, { "epoch": 2.0227706398677014, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.53692626953125, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.865882158279419, "num_tokens": 606598140.0, "step": 15901 }, { "epoch": 2.022897850146292, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.867794036865234, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8555878400802612, "num_tokens": 606636480.0, "step": 15902 }, { "epoch": 2.0230250604248825, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.08317184448242, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8664032816886902, "num_tokens": 606676044.0, "step": 15903 }, { "epoch": 2.023152270703473, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.507057189941406, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8549897074699402, "num_tokens": 606713262.0, "step": 15904 }, { "epoch": 2.0232794809820636, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.38090896606445, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8580954074859619, "num_tokens": 606754844.0, "step": 15905 }, { "epoch": 2.0234066912606536, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.519561767578125, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8642256855964661, "num_tokens": 606800910.0, "step": 15906 }, { "epoch": 2.023533901539244, "ewc_loss": 0.166015625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014400482177734375, "grad_norm": 46.51652526855469, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8583471775054932, "num_tokens": 606841855.0, "step": 15907 }, { "epoch": 2.0236611118178347, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.84236145019531, "learning_rate": 1e-06, "loss": 0.6037, "mean_token_accuracy": 0.852137565612793, "num_tokens": 606888261.0, "step": 15908 }, { "epoch": 2.0237883220964252, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.56919479370117, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.854587972164154, "num_tokens": 606926235.0, "step": 15909 }, { "epoch": 2.0239155323750158, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.33271408081055, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8573960661888123, "num_tokens": 606962527.0, "step": 15910 }, { "epoch": 2.0240427426536063, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.91670227050781, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8684926629066467, "num_tokens": 607008924.0, "step": 15911 }, { "epoch": 2.024169952932197, "ewc_loss": 0.1650390625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001430511474609375, "grad_norm": 45.3510627746582, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8596354126930237, "num_tokens": 607049788.0, "step": 15912 }, { "epoch": 2.0242971632107873, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.38895797729492, "learning_rate": 1e-06, "loss": 0.5989, "mean_token_accuracy": 0.8563508987426758, "num_tokens": 607087592.0, "step": 15913 }, { "epoch": 2.024424373489378, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.52024841308594, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8770850300788879, "num_tokens": 607119954.0, "step": 15914 }, { "epoch": 2.0245515837679684, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.07329559326172, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8531067967414856, "num_tokens": 607156511.0, "step": 15915 }, { "epoch": 2.024678794046559, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.66162109375, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8700984716415405, "num_tokens": 607198292.0, "step": 15916 }, { "epoch": 2.0248060043251495, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.170570373535156, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8601905107498169, "num_tokens": 607237572.0, "step": 15917 }, { "epoch": 2.02493321460374, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.31793975830078, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8541992902755737, "num_tokens": 607282321.0, "step": 15918 }, { "epoch": 2.0250604248823305, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.53023910522461, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.860126256942749, "num_tokens": 607313355.0, "step": 15919 }, { "epoch": 2.025187635160921, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.70820617675781, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8599467873573303, "num_tokens": 607351961.0, "step": 15920 }, { "epoch": 2.0253148454395116, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.0269775390625, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8649773597717285, "num_tokens": 607391368.0, "step": 15921 }, { "epoch": 2.025442055718102, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.55910110473633, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8809272050857544, "num_tokens": 607427328.0, "step": 15922 }, { "epoch": 2.0255692659966926, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.59953689575195, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8581725358963013, "num_tokens": 607468243.0, "step": 15923 }, { "epoch": 2.025696476275283, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.9536018371582, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8461868762969971, "num_tokens": 607502466.0, "step": 15924 }, { "epoch": 2.0258236865538737, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.06647491455078, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8539178967475891, "num_tokens": 607544093.0, "step": 15925 }, { "epoch": 2.025950896832464, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.77229690551758, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8686016201972961, "num_tokens": 607576054.0, "step": 15926 }, { "epoch": 2.0260781071110547, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.76402282714844, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8764209747314453, "num_tokens": 607608462.0, "step": 15927 }, { "epoch": 2.0262053173896453, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.02690124511719, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8547029495239258, "num_tokens": 607645272.0, "step": 15928 }, { "epoch": 2.026332527668236, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.765010833740234, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8633402585983276, "num_tokens": 607682585.0, "step": 15929 }, { "epoch": 2.0264597379468263, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.79128646850586, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8805413246154785, "num_tokens": 607721302.0, "step": 15930 }, { "epoch": 2.0265869482254164, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.95234680175781, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8568436503410339, "num_tokens": 607758357.0, "step": 15931 }, { "epoch": 2.026714158504007, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.80592346191406, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8629148006439209, "num_tokens": 607795467.0, "step": 15932 }, { "epoch": 2.0268413687825975, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.074581146240234, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8584494590759277, "num_tokens": 607833192.0, "step": 15933 }, { "epoch": 2.026968579061188, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.66415786743164, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.871593177318573, "num_tokens": 607866672.0, "step": 15934 }, { "epoch": 2.0270957893397785, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.111915588378906, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8631946444511414, "num_tokens": 607907976.0, "step": 15935 }, { "epoch": 2.027222999618369, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.464900970458984, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8658999800682068, "num_tokens": 607946009.0, "step": 15936 }, { "epoch": 2.0273502098969596, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.248775482177734, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.867909848690033, "num_tokens": 607986800.0, "step": 15937 }, { "epoch": 2.02747742017555, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.19703674316406, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8570479154586792, "num_tokens": 608021807.0, "step": 15938 }, { "epoch": 2.0276046304541406, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.30046463012695, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8614847660064697, "num_tokens": 608060294.0, "step": 15939 }, { "epoch": 2.027731840732731, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.615966796875, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8625878691673279, "num_tokens": 608100790.0, "step": 15940 }, { "epoch": 2.0278590510113217, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.38854217529297, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8581141829490662, "num_tokens": 608142164.0, "step": 15941 }, { "epoch": 2.027986261289912, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.84357833862305, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8632372617721558, "num_tokens": 608184175.0, "step": 15942 }, { "epoch": 2.0281134715685027, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.77535629272461, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.857537031173706, "num_tokens": 608226801.0, "step": 15943 }, { "epoch": 2.0282406818470933, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.77168655395508, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8708783388137817, "num_tokens": 608265155.0, "step": 15944 }, { "epoch": 2.028367892125684, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.898345947265625, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8729303479194641, "num_tokens": 608303817.0, "step": 15945 }, { "epoch": 2.0284951024042743, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.651947021484375, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.869846761226654, "num_tokens": 608338845.0, "step": 15946 }, { "epoch": 2.028622312682865, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.67163848876953, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8627964854240417, "num_tokens": 608377661.0, "step": 15947 }, { "epoch": 2.0287495229614554, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.710906982421875, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8625873327255249, "num_tokens": 608412897.0, "step": 15948 }, { "epoch": 2.028876733240046, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.551849365234375, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8546364307403564, "num_tokens": 608453584.0, "step": 15949 }, { "epoch": 2.0290039435186364, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.82515335083008, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8576356172561646, "num_tokens": 608492211.0, "step": 15950 }, { "epoch": 2.029131153797227, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.684486389160156, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8588368892669678, "num_tokens": 608534317.0, "step": 15951 }, { "epoch": 2.0292583640758175, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.77268600463867, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8727635741233826, "num_tokens": 608569763.0, "step": 15952 }, { "epoch": 2.029385574354408, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.56725311279297, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8653088212013245, "num_tokens": 608612145.0, "step": 15953 }, { "epoch": 2.0295127846329986, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.9968147277832, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8616490364074707, "num_tokens": 608649528.0, "step": 15954 }, { "epoch": 2.0296399949115886, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.39226150512695, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8562859296798706, "num_tokens": 608686973.0, "step": 15955 }, { "epoch": 2.029767205190179, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.814247131347656, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8651273846626282, "num_tokens": 608729169.0, "step": 15956 }, { "epoch": 2.0298944154687697, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.118709564208984, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8607845306396484, "num_tokens": 608764502.0, "step": 15957 }, { "epoch": 2.0300216257473602, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.38269805908203, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8614342212677002, "num_tokens": 608805190.0, "step": 15958 }, { "epoch": 2.0301488360259508, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.88191604614258, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8687372207641602, "num_tokens": 608840006.0, "step": 15959 }, { "epoch": 2.0302760463045413, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.44983673095703, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8690229654312134, "num_tokens": 608871551.0, "step": 15960 }, { "epoch": 2.030403256583132, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.249629974365234, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8414978384971619, "num_tokens": 608909610.0, "step": 15961 }, { "epoch": 2.0305304668617223, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.537105560302734, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8596128225326538, "num_tokens": 608950758.0, "step": 15962 }, { "epoch": 2.030657677140313, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.93644332885742, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8507475852966309, "num_tokens": 608990695.0, "step": 15963 }, { "epoch": 2.0307848874189034, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.68642044067383, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8769042491912842, "num_tokens": 609030978.0, "step": 15964 }, { "epoch": 2.030912097697494, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.11680221557617, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8730025887489319, "num_tokens": 609071974.0, "step": 15965 }, { "epoch": 2.0310393079760845, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.77333450317383, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8648632764816284, "num_tokens": 609111268.0, "step": 15966 }, { "epoch": 2.031166518254675, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.90229797363281, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8588690757751465, "num_tokens": 609149681.0, "step": 15967 }, { "epoch": 2.0312937285332655, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.00747299194336, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.86845862865448, "num_tokens": 609189121.0, "step": 15968 }, { "epoch": 2.031420938811856, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.43986511230469, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8729567527770996, "num_tokens": 609228070.0, "step": 15969 }, { "epoch": 2.0315481490904466, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.26510238647461, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.865584671497345, "num_tokens": 609269718.0, "step": 15970 }, { "epoch": 2.031675359369037, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.40755844116211, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.865522563457489, "num_tokens": 609307236.0, "step": 15971 }, { "epoch": 2.0318025696476276, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.82008361816406, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8451708555221558, "num_tokens": 609342993.0, "step": 15972 }, { "epoch": 2.031929779926218, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.47583770751953, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8696349859237671, "num_tokens": 609376817.0, "step": 15973 }, { "epoch": 2.0320569902048087, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.83893966674805, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8603731989860535, "num_tokens": 609411209.0, "step": 15974 }, { "epoch": 2.032184200483399, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.83770751953125, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8801920413970947, "num_tokens": 609449055.0, "step": 15975 }, { "epoch": 2.0323114107619897, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.78009033203125, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8623257875442505, "num_tokens": 609486332.0, "step": 15976 }, { "epoch": 2.0324386210405803, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.83262634277344, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8498523235321045, "num_tokens": 609526924.0, "step": 15977 }, { "epoch": 2.032565831319171, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.746498107910156, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8773297071456909, "num_tokens": 609559542.0, "step": 15978 }, { "epoch": 2.032693041597761, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.41530990600586, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8560396432876587, "num_tokens": 609598989.0, "step": 15979 }, { "epoch": 2.0328202518763514, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.58336639404297, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.850419819355011, "num_tokens": 609635790.0, "step": 15980 }, { "epoch": 2.032947462154942, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.02622604370117, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8563216924667358, "num_tokens": 609670120.0, "step": 15981 }, { "epoch": 2.0330746724335325, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.79872131347656, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.861853837966919, "num_tokens": 609713260.0, "step": 15982 }, { "epoch": 2.033201882712123, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.505069732666016, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.868874192237854, "num_tokens": 609753236.0, "step": 15983 }, { "epoch": 2.0333290929907135, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.514102935791016, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8656986951828003, "num_tokens": 609791629.0, "step": 15984 }, { "epoch": 2.033456303269304, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.40350341796875, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.846922755241394, "num_tokens": 609833457.0, "step": 15985 }, { "epoch": 2.0335835135478946, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.11207962036133, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8558642864227295, "num_tokens": 609866007.0, "step": 15986 }, { "epoch": 2.033710723826485, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.989349365234375, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8672121167182922, "num_tokens": 609903346.0, "step": 15987 }, { "epoch": 2.0338379341050756, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.96632385253906, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8675850629806519, "num_tokens": 609935885.0, "step": 15988 }, { "epoch": 2.033965144383666, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.851173400878906, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8568639159202576, "num_tokens": 609971127.0, "step": 15989 }, { "epoch": 2.0340923546622567, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.18808364868164, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.85440593957901, "num_tokens": 610016719.0, "step": 15990 }, { "epoch": 2.034219564940847, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.186790466308594, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8522953987121582, "num_tokens": 610057002.0, "step": 15991 }, { "epoch": 2.0343467752194377, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.062713623046875, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.868134617805481, "num_tokens": 610092779.0, "step": 15992 }, { "epoch": 2.0344739854980283, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.8544807434082, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8520712852478027, "num_tokens": 610137092.0, "step": 15993 }, { "epoch": 2.034601195776619, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.44329833984375, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8723375797271729, "num_tokens": 610178632.0, "step": 15994 }, { "epoch": 2.0347284060552093, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.860286712646484, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8608599305152893, "num_tokens": 610219789.0, "step": 15995 }, { "epoch": 2.0348556163338, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.91181564331055, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8490098714828491, "num_tokens": 610259104.0, "step": 15996 }, { "epoch": 2.0349828266123904, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.36079406738281, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.856913685798645, "num_tokens": 610296317.0, "step": 15997 }, { "epoch": 2.035110036890981, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.60546112060547, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8615065813064575, "num_tokens": 610339300.0, "step": 15998 }, { "epoch": 2.0352372471695714, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.02039337158203, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8579192161560059, "num_tokens": 610373917.0, "step": 15999 }, { "epoch": 2.035364457448162, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.33760452270508, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8618817329406738, "num_tokens": 610419571.0, "step": 16000 }, { "epoch": 2.0354916677267525, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.972110748291016, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8560435771942139, "num_tokens": 610458073.0, "step": 16001 }, { "epoch": 2.035618878005343, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.9642219543457, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8505285978317261, "num_tokens": 610497223.0, "step": 16002 }, { "epoch": 2.0357460882839336, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.180171966552734, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8421043157577515, "num_tokens": 610532563.0, "step": 16003 }, { "epoch": 2.0358732985625236, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.715309143066406, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8756958246231079, "num_tokens": 610566644.0, "step": 16004 }, { "epoch": 2.036000508841114, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.531978607177734, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8501678705215454, "num_tokens": 610604755.0, "step": 16005 }, { "epoch": 2.0361277191197047, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.57570266723633, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8665292859077454, "num_tokens": 610645905.0, "step": 16006 }, { "epoch": 2.036254929398295, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.84341812133789, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8798146843910217, "num_tokens": 610686800.0, "step": 16007 }, { "epoch": 2.0363821396768857, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.407222747802734, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8617009520530701, "num_tokens": 610728291.0, "step": 16008 }, { "epoch": 2.0365093499554763, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.83993911743164, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.859176754951477, "num_tokens": 610759818.0, "step": 16009 }, { "epoch": 2.036636560234067, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.244789123535156, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8477033376693726, "num_tokens": 610800125.0, "step": 16010 }, { "epoch": 2.0367637705126573, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.57394790649414, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.853560209274292, "num_tokens": 610839261.0, "step": 16011 }, { "epoch": 2.036890980791248, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.16359329223633, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8670263886451721, "num_tokens": 610879092.0, "step": 16012 }, { "epoch": 2.0370181910698384, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.94468307495117, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8624459505081177, "num_tokens": 610914494.0, "step": 16013 }, { "epoch": 2.037145401348429, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.712890625, "learning_rate": 1e-06, "loss": 0.5464, "mean_token_accuracy": 0.873283326625824, "num_tokens": 610950784.0, "step": 16014 }, { "epoch": 2.0372726116270194, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.225677490234375, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8505444526672363, "num_tokens": 610989254.0, "step": 16015 }, { "epoch": 2.03739982190561, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.92581558227539, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8548665046691895, "num_tokens": 611022668.0, "step": 16016 }, { "epoch": 2.0375270321842005, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.772403717041016, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8635166883468628, "num_tokens": 611062110.0, "step": 16017 }, { "epoch": 2.037654242462791, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.63322830200195, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8684374690055847, "num_tokens": 611098663.0, "step": 16018 }, { "epoch": 2.0377814527413816, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.69002151489258, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8584612607955933, "num_tokens": 611134058.0, "step": 16019 }, { "epoch": 2.037908663019972, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.00838088989258, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8620378971099854, "num_tokens": 611174748.0, "step": 16020 }, { "epoch": 2.0380358732985626, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.67313766479492, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.877303957939148, "num_tokens": 611216792.0, "step": 16021 }, { "epoch": 2.038163083577153, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.25941848754883, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8708094358444214, "num_tokens": 611253996.0, "step": 16022 }, { "epoch": 2.0382902938557437, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.83460998535156, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8714921474456787, "num_tokens": 611295351.0, "step": 16023 }, { "epoch": 2.038417504134334, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.002899169921875, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8621098399162292, "num_tokens": 611330513.0, "step": 16024 }, { "epoch": 2.0385447144129247, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.92598342895508, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8698691129684448, "num_tokens": 611369679.0, "step": 16025 }, { "epoch": 2.0386719246915153, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.177337646484375, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.870829164981842, "num_tokens": 611410398.0, "step": 16026 }, { "epoch": 2.038799134970106, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.706626892089844, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8537782430648804, "num_tokens": 611447151.0, "step": 16027 }, { "epoch": 2.0389263452486963, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.48333740234375, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8540905714035034, "num_tokens": 611486923.0, "step": 16028 }, { "epoch": 2.0390535555272864, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.74863052368164, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8676173686981201, "num_tokens": 611519502.0, "step": 16029 }, { "epoch": 2.039180765805877, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.424251556396484, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8491215705871582, "num_tokens": 611557647.0, "step": 16030 }, { "epoch": 2.0393079760844675, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.77952194213867, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8619921207427979, "num_tokens": 611597405.0, "step": 16031 }, { "epoch": 2.039435186363058, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.346656799316406, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8539972305297852, "num_tokens": 611635996.0, "step": 16032 }, { "epoch": 2.0395623966416485, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.8804931640625, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8700151443481445, "num_tokens": 611677900.0, "step": 16033 }, { "epoch": 2.039689606920239, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.947994232177734, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8651416301727295, "num_tokens": 611719184.0, "step": 16034 }, { "epoch": 2.0398168171988296, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.47776794433594, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8608479499816895, "num_tokens": 611761270.0, "step": 16035 }, { "epoch": 2.03994402747742, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.34604263305664, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8602887392044067, "num_tokens": 611794939.0, "step": 16036 }, { "epoch": 2.0400712377560106, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.329532623291016, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8744426965713501, "num_tokens": 611834093.0, "step": 16037 }, { "epoch": 2.040198448034601, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.608734130859375, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8574604988098145, "num_tokens": 611871327.0, "step": 16038 }, { "epoch": 2.0403256583131917, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.116844177246094, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8547384738922119, "num_tokens": 611907837.0, "step": 16039 }, { "epoch": 2.040452868591782, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.04054641723633, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.8433148860931396, "num_tokens": 611950964.0, "step": 16040 }, { "epoch": 2.0405800788703727, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.96449279785156, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8579598665237427, "num_tokens": 611985867.0, "step": 16041 }, { "epoch": 2.0407072891489633, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.35057830810547, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8654841780662537, "num_tokens": 612017819.0, "step": 16042 }, { "epoch": 2.040834499427554, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.63091278076172, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8477689623832703, "num_tokens": 612060669.0, "step": 16043 }, { "epoch": 2.0409617097061443, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.639530181884766, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8481873869895935, "num_tokens": 612099801.0, "step": 16044 }, { "epoch": 2.041088919984735, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.517669677734375, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8666320443153381, "num_tokens": 612139454.0, "step": 16045 }, { "epoch": 2.0412161302633254, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.59441375732422, "learning_rate": 1e-06, "loss": 0.6407, "mean_token_accuracy": 0.8423665165901184, "num_tokens": 612181695.0, "step": 16046 }, { "epoch": 2.041343340541916, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.5114631652832, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8772917985916138, "num_tokens": 612223200.0, "step": 16047 }, { "epoch": 2.0414705508205064, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.547183990478516, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8640059232711792, "num_tokens": 612265946.0, "step": 16048 }, { "epoch": 2.041597761099097, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.672264099121094, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8790786266326904, "num_tokens": 612301246.0, "step": 16049 }, { "epoch": 2.0417249713776875, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.70977783203125, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8477193117141724, "num_tokens": 612337789.0, "step": 16050 }, { "epoch": 2.041852181656278, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.80345153808594, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8743811845779419, "num_tokens": 612372391.0, "step": 16051 }, { "epoch": 2.0419793919348685, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.68622589111328, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8644240498542786, "num_tokens": 612408806.0, "step": 16052 }, { "epoch": 2.0421066022134586, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.58909606933594, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8733037710189819, "num_tokens": 612446349.0, "step": 16053 }, { "epoch": 2.042233812492049, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.171295166015625, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8735707402229309, "num_tokens": 612489899.0, "step": 16054 }, { "epoch": 2.0423610227706397, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.9873046875, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8587608933448792, "num_tokens": 612532314.0, "step": 16055 }, { "epoch": 2.04248823304923, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.03340148925781, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8697353601455688, "num_tokens": 612568033.0, "step": 16056 }, { "epoch": 2.0426154433278207, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.685665130615234, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8698402047157288, "num_tokens": 612605160.0, "step": 16057 }, { "epoch": 2.0427426536064113, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.22129440307617, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8500943183898926, "num_tokens": 612647851.0, "step": 16058 }, { "epoch": 2.042869863885002, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.97993469238281, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.862014651298523, "num_tokens": 612684770.0, "step": 16059 }, { "epoch": 2.0429970741635923, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.25718307495117, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.850923478603363, "num_tokens": 612721689.0, "step": 16060 }, { "epoch": 2.043124284442183, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.55294418334961, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8601763844490051, "num_tokens": 612757256.0, "step": 16061 }, { "epoch": 2.0432514947207734, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.09130859375, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.868578314781189, "num_tokens": 612797806.0, "step": 16062 }, { "epoch": 2.043378704999364, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.84396743774414, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8728047013282776, "num_tokens": 612833507.0, "step": 16063 }, { "epoch": 2.0435059152779544, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.12434768676758, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8685055375099182, "num_tokens": 612872920.0, "step": 16064 }, { "epoch": 2.043633125556545, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.79890441894531, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.8692765235900879, "num_tokens": 612909534.0, "step": 16065 }, { "epoch": 2.0437603358351355, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.91670608520508, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8585139513015747, "num_tokens": 612950531.0, "step": 16066 }, { "epoch": 2.043887546113726, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.14237594604492, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8671241998672485, "num_tokens": 612984638.0, "step": 16067 }, { "epoch": 2.0440147563923166, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.81547546386719, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8747022151947021, "num_tokens": 613020632.0, "step": 16068 }, { "epoch": 2.044141966670907, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.60926055908203, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8672783374786377, "num_tokens": 613057381.0, "step": 16069 }, { "epoch": 2.0442691769494976, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.23725891113281, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8612257242202759, "num_tokens": 613094287.0, "step": 16070 }, { "epoch": 2.044396387228088, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.017372131347656, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8701915740966797, "num_tokens": 613126315.0, "step": 16071 }, { "epoch": 2.0445235975066787, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.421634674072266, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8586368560791016, "num_tokens": 613160970.0, "step": 16072 }, { "epoch": 2.044650807785269, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.52696990966797, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.866263210773468, "num_tokens": 613204685.0, "step": 16073 }, { "epoch": 2.0447780180638597, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.6738166809082, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8723934888839722, "num_tokens": 613242394.0, "step": 16074 }, { "epoch": 2.0449052283424503, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.085609436035156, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8705724477767944, "num_tokens": 613283517.0, "step": 16075 }, { "epoch": 2.045032438621041, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.11192321777344, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8668235540390015, "num_tokens": 613317237.0, "step": 16076 }, { "epoch": 2.045159648899631, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.361907958984375, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8664060235023499, "num_tokens": 613355741.0, "step": 16077 }, { "epoch": 2.0452868591782214, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.30778121948242, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8533072471618652, "num_tokens": 613392131.0, "step": 16078 }, { "epoch": 2.045414069456812, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.32971954345703, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8545770645141602, "num_tokens": 613433687.0, "step": 16079 }, { "epoch": 2.0455412797354025, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.01850509643555, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8671485781669617, "num_tokens": 613472882.0, "step": 16080 }, { "epoch": 2.045668490013993, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.10776138305664, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8683065176010132, "num_tokens": 613515869.0, "step": 16081 }, { "epoch": 2.0457957002925835, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.41170120239258, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8635542392730713, "num_tokens": 613551835.0, "step": 16082 }, { "epoch": 2.045922910571174, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.834415435791016, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8619512319564819, "num_tokens": 613592277.0, "step": 16083 }, { "epoch": 2.0460501208497646, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.842525482177734, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8616372346878052, "num_tokens": 613625594.0, "step": 16084 }, { "epoch": 2.046177331128355, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.56528854370117, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.86527419090271, "num_tokens": 613659924.0, "step": 16085 }, { "epoch": 2.0463045414069456, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.3851203918457, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8447048664093018, "num_tokens": 613697067.0, "step": 16086 }, { "epoch": 2.046431751685536, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.916748046875, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8716821670532227, "num_tokens": 613738211.0, "step": 16087 }, { "epoch": 2.0465589619641267, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.24906921386719, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.8687376976013184, "num_tokens": 613782590.0, "step": 16088 }, { "epoch": 2.046686172242717, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.89628601074219, "learning_rate": 1e-06, "loss": 0.5439, "mean_token_accuracy": 0.8715599775314331, "num_tokens": 613825770.0, "step": 16089 }, { "epoch": 2.0468133825213077, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.893089294433594, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8685760498046875, "num_tokens": 613864025.0, "step": 16090 }, { "epoch": 2.0469405927998983, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.428497314453125, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8654214143753052, "num_tokens": 613907181.0, "step": 16091 }, { "epoch": 2.047067803078489, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.289512634277344, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8539547324180603, "num_tokens": 613951258.0, "step": 16092 }, { "epoch": 2.0471950133570793, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.88337707519531, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8611624240875244, "num_tokens": 613991221.0, "step": 16093 }, { "epoch": 2.04732222363567, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.631553649902344, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8740087151527405, "num_tokens": 614026862.0, "step": 16094 }, { "epoch": 2.0474494339142604, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.60024642944336, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8536157608032227, "num_tokens": 614067560.0, "step": 16095 }, { "epoch": 2.047576644192851, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.556800842285156, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8663589954376221, "num_tokens": 614106974.0, "step": 16096 }, { "epoch": 2.0477038544714414, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.33607864379883, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8908383846282959, "num_tokens": 614145641.0, "step": 16097 }, { "epoch": 2.047831064750032, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.91040802001953, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8676923513412476, "num_tokens": 614181476.0, "step": 16098 }, { "epoch": 2.0479582750286225, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.536861419677734, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8587098121643066, "num_tokens": 614219212.0, "step": 16099 }, { "epoch": 2.048085485307213, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.448665618896484, "learning_rate": 1e-06, "loss": 0.6737, "mean_token_accuracy": 0.8336570262908936, "num_tokens": 614264017.0, "step": 16100 }, { "epoch": 2.0482126955858035, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.17890930175781, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8458769917488098, "num_tokens": 614299031.0, "step": 16101 }, { "epoch": 2.0483399058643936, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.00016403198242, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8706268072128296, "num_tokens": 614334387.0, "step": 16102 }, { "epoch": 2.048467116142984, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.85546875, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8634697198867798, "num_tokens": 614372438.0, "step": 16103 }, { "epoch": 2.0485943264215747, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.52041244506836, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8573287129402161, "num_tokens": 614409976.0, "step": 16104 }, { "epoch": 2.048721536700165, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.94975280761719, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8709259033203125, "num_tokens": 614448740.0, "step": 16105 }, { "epoch": 2.0488487469787557, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.48112487792969, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8674915432929993, "num_tokens": 614484644.0, "step": 16106 }, { "epoch": 2.0489759572573463, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.20330047607422, "learning_rate": 1e-06, "loss": 0.6354, "mean_token_accuracy": 0.8437551259994507, "num_tokens": 614521210.0, "step": 16107 }, { "epoch": 2.049103167535937, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.2717399597168, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8571372032165527, "num_tokens": 614561783.0, "step": 16108 }, { "epoch": 2.0492303778145273, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.9563102722168, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8644713163375854, "num_tokens": 614598643.0, "step": 16109 }, { "epoch": 2.049357588093118, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.72645568847656, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8614072799682617, "num_tokens": 614633520.0, "step": 16110 }, { "epoch": 2.0494847983717084, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.99290466308594, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8482733368873596, "num_tokens": 614673266.0, "step": 16111 }, { "epoch": 2.049612008650299, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.94028091430664, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8539473414421082, "num_tokens": 614707201.0, "step": 16112 }, { "epoch": 2.0497392189288894, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.6212158203125, "learning_rate": 1e-06, "loss": 0.5558, "mean_token_accuracy": 0.8671755790710449, "num_tokens": 614746542.0, "step": 16113 }, { "epoch": 2.04986642920748, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.60528564453125, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8517231941223145, "num_tokens": 614781804.0, "step": 16114 }, { "epoch": 2.0499936394860705, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.8841438293457, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8629554510116577, "num_tokens": 614817281.0, "step": 16115 }, { "epoch": 2.050120849764661, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.22941970825195, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.853408932685852, "num_tokens": 614853666.0, "step": 16116 }, { "epoch": 2.0502480600432516, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.092647552490234, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.864280641078949, "num_tokens": 614891221.0, "step": 16117 }, { "epoch": 2.050375270321842, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.48116683959961, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8711149096488953, "num_tokens": 614925956.0, "step": 16118 }, { "epoch": 2.0505024806004326, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.99613952636719, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8696083426475525, "num_tokens": 614959934.0, "step": 16119 }, { "epoch": 2.050629690879023, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.92926025390625, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8596168160438538, "num_tokens": 615000579.0, "step": 16120 }, { "epoch": 2.0507569011576137, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.50457000732422, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8563175201416016, "num_tokens": 615037371.0, "step": 16121 }, { "epoch": 2.050884111436204, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.070980072021484, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8532034158706665, "num_tokens": 615075948.0, "step": 16122 }, { "epoch": 2.0510113217147947, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.37618637084961, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8783011436462402, "num_tokens": 615114619.0, "step": 16123 }, { "epoch": 2.0511385319933853, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.61557388305664, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8640574216842651, "num_tokens": 615149260.0, "step": 16124 }, { "epoch": 2.051265742271976, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.94231033325195, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.865943431854248, "num_tokens": 615183284.0, "step": 16125 }, { "epoch": 2.0513929525505663, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.962825775146484, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8696910738945007, "num_tokens": 615220015.0, "step": 16126 }, { "epoch": 2.0515201628291564, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.1039924621582, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8563559055328369, "num_tokens": 615255614.0, "step": 16127 }, { "epoch": 2.051647373107747, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.392757415771484, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.862889289855957, "num_tokens": 615290232.0, "step": 16128 }, { "epoch": 2.0517745833863374, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.232322692871094, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8683069348335266, "num_tokens": 615328204.0, "step": 16129 }, { "epoch": 2.051901793664928, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.4661979675293, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8558036088943481, "num_tokens": 615362846.0, "step": 16130 }, { "epoch": 2.0520290039435185, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.36632537841797, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8612159490585327, "num_tokens": 615405404.0, "step": 16131 }, { "epoch": 2.052156214222109, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.31782150268555, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8511145114898682, "num_tokens": 615439835.0, "step": 16132 }, { "epoch": 2.0522834245006996, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.60824203491211, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.873729407787323, "num_tokens": 615477832.0, "step": 16133 }, { "epoch": 2.05241063477929, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.92121887207031, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8542078733444214, "num_tokens": 615518089.0, "step": 16134 }, { "epoch": 2.0525378450578806, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.19533157348633, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8674149513244629, "num_tokens": 615557286.0, "step": 16135 }, { "epoch": 2.052665055336471, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.02885437011719, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8666722774505615, "num_tokens": 615593284.0, "step": 16136 }, { "epoch": 2.0527922656150617, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.580047607421875, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8590776324272156, "num_tokens": 615633579.0, "step": 16137 }, { "epoch": 2.052919475893652, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.64388656616211, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8690182566642761, "num_tokens": 615680135.0, "step": 16138 }, { "epoch": 2.0530466861722427, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.5389289855957, "learning_rate": 1e-06, "loss": 0.6219, "mean_token_accuracy": 0.8503376245498657, "num_tokens": 615714801.0, "step": 16139 }, { "epoch": 2.0531738964508333, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.9219856262207, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8564032316207886, "num_tokens": 615750538.0, "step": 16140 }, { "epoch": 2.053301106729424, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.88316345214844, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.8671165704727173, "num_tokens": 615789465.0, "step": 16141 }, { "epoch": 2.0534283170080143, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.97422409057617, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8485720157623291, "num_tokens": 615832553.0, "step": 16142 }, { "epoch": 2.053555527286605, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.91432571411133, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8590266704559326, "num_tokens": 615868341.0, "step": 16143 }, { "epoch": 2.0536827375651954, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.87529373168945, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8733811378479004, "num_tokens": 615904324.0, "step": 16144 }, { "epoch": 2.053809947843786, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.506839752197266, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8708657026290894, "num_tokens": 615947656.0, "step": 16145 }, { "epoch": 2.0539371581223764, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.0823860168457, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8576916456222534, "num_tokens": 615989050.0, "step": 16146 }, { "epoch": 2.054064368400967, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.33203125, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8677133321762085, "num_tokens": 616027638.0, "step": 16147 }, { "epoch": 2.0541915786795575, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.36691665649414, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8701444864273071, "num_tokens": 616066392.0, "step": 16148 }, { "epoch": 2.054318788958148, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.216514587402344, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8718200325965881, "num_tokens": 616108303.0, "step": 16149 }, { "epoch": 2.0544459992367385, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.61883544921875, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8550965785980225, "num_tokens": 616154174.0, "step": 16150 }, { "epoch": 2.0545732095153286, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.331729888916016, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8601788282394409, "num_tokens": 616193954.0, "step": 16151 }, { "epoch": 2.054700419793919, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.33974838256836, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8681081533432007, "num_tokens": 616232771.0, "step": 16152 }, { "epoch": 2.0548276300725097, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.41996765136719, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.874055802822113, "num_tokens": 616265151.0, "step": 16153 }, { "epoch": 2.0549548403511, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.307464599609375, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8565959334373474, "num_tokens": 616303886.0, "step": 16154 }, { "epoch": 2.0550820506296907, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.21917724609375, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8732938170433044, "num_tokens": 616340040.0, "step": 16155 }, { "epoch": 2.0552092609082813, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.23348617553711, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8663506507873535, "num_tokens": 616371329.0, "step": 16156 }, { "epoch": 2.055336471186872, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.21586990356445, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8766034841537476, "num_tokens": 616406383.0, "step": 16157 }, { "epoch": 2.0554636814654623, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.395957946777344, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.849339485168457, "num_tokens": 616445600.0, "step": 16158 }, { "epoch": 2.055590891744053, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.95977020263672, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8553802967071533, "num_tokens": 616485376.0, "step": 16159 }, { "epoch": 2.0557181020226434, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.3934440612793, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8522659540176392, "num_tokens": 616527104.0, "step": 16160 }, { "epoch": 2.055845312301234, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.76637268066406, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8657456636428833, "num_tokens": 616567557.0, "step": 16161 }, { "epoch": 2.0559725225798244, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.57614517211914, "learning_rate": 1e-06, "loss": 0.559, "mean_token_accuracy": 0.867073118686676, "num_tokens": 616601612.0, "step": 16162 }, { "epoch": 2.056099732858415, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.589210510253906, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.861808717250824, "num_tokens": 616638019.0, "step": 16163 }, { "epoch": 2.0562269431370055, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.94833755493164, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8616693019866943, "num_tokens": 616675224.0, "step": 16164 }, { "epoch": 2.056354153415596, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.93418502807617, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8602992296218872, "num_tokens": 616713628.0, "step": 16165 }, { "epoch": 2.0564813636941865, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.27058792114258, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8605863451957703, "num_tokens": 616755985.0, "step": 16166 }, { "epoch": 2.056608573972777, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.221805572509766, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8643463253974915, "num_tokens": 616786564.0, "step": 16167 }, { "epoch": 2.0567357842513676, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.02595520019531, "learning_rate": 1e-06, "loss": 0.6685, "mean_token_accuracy": 0.8328181505203247, "num_tokens": 616823285.0, "step": 16168 }, { "epoch": 2.056862994529958, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.937705993652344, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8712497353553772, "num_tokens": 616864465.0, "step": 16169 }, { "epoch": 2.0569902048085487, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.947914123535156, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8523585796356201, "num_tokens": 616906670.0, "step": 16170 }, { "epoch": 2.057117415087139, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.344627380371094, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8617637157440186, "num_tokens": 616947656.0, "step": 16171 }, { "epoch": 2.0572446253657297, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.76999282836914, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8547564148902893, "num_tokens": 616980161.0, "step": 16172 }, { "epoch": 2.0573718356443202, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.11342239379883, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8625978231430054, "num_tokens": 617015281.0, "step": 16173 }, { "epoch": 2.0574990459229108, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.058563232421875, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8632962107658386, "num_tokens": 617054174.0, "step": 16174 }, { "epoch": 2.057626256201501, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.155487060546875, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.862911581993103, "num_tokens": 617095160.0, "step": 16175 }, { "epoch": 2.0577534664800914, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.92177200317383, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8669140338897705, "num_tokens": 617130781.0, "step": 16176 }, { "epoch": 2.057880676758682, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.3119010925293, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8612036108970642, "num_tokens": 617169473.0, "step": 16177 }, { "epoch": 2.0580078870372724, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.705177307128906, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8563923835754395, "num_tokens": 617212208.0, "step": 16178 }, { "epoch": 2.058135097315863, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.92048645019531, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8641741871833801, "num_tokens": 617246055.0, "step": 16179 }, { "epoch": 2.0582623075944535, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.92201232910156, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8584606647491455, "num_tokens": 617285614.0, "step": 16180 }, { "epoch": 2.058389517873044, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.896995544433594, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8705825805664062, "num_tokens": 617323401.0, "step": 16181 }, { "epoch": 2.0585167281516346, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.831844329833984, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8484821319580078, "num_tokens": 617372550.0, "step": 16182 }, { "epoch": 2.058643938430225, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.021629333496094, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8676734566688538, "num_tokens": 617407894.0, "step": 16183 }, { "epoch": 2.0587711487088156, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.18128967285156, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8541466593742371, "num_tokens": 617450117.0, "step": 16184 }, { "epoch": 2.058898358987406, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.67060852050781, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8673890233039856, "num_tokens": 617485885.0, "step": 16185 }, { "epoch": 2.0590255692659967, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.33544921875, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8452011346817017, "num_tokens": 617520511.0, "step": 16186 }, { "epoch": 2.059152779544587, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.12821960449219, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8552061319351196, "num_tokens": 617558288.0, "step": 16187 }, { "epoch": 2.0592799898231777, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 45.99102020263672, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8624368906021118, "num_tokens": 617602991.0, "step": 16188 }, { "epoch": 2.0594072001017683, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.16297149658203, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8624125123023987, "num_tokens": 617639629.0, "step": 16189 }, { "epoch": 2.059534410380359, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.16166305541992, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8570968508720398, "num_tokens": 617676218.0, "step": 16190 }, { "epoch": 2.0596616206589493, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.89189910888672, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8552896976470947, "num_tokens": 617716270.0, "step": 16191 }, { "epoch": 2.05978883093754, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.34092330932617, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8712441921234131, "num_tokens": 617755559.0, "step": 16192 }, { "epoch": 2.0599160412161304, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.651554107666016, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8660276532173157, "num_tokens": 617793522.0, "step": 16193 }, { "epoch": 2.060043251494721, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.79508972167969, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8663349747657776, "num_tokens": 617828005.0, "step": 16194 }, { "epoch": 2.0601704617733114, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.73648452758789, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8585122227668762, "num_tokens": 617863824.0, "step": 16195 }, { "epoch": 2.060297672051902, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.19172286987305, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8596996665000916, "num_tokens": 617903648.0, "step": 16196 }, { "epoch": 2.0604248823304925, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.0120849609375, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8587020635604858, "num_tokens": 617946229.0, "step": 16197 }, { "epoch": 2.060552092609083, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.97311019897461, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8623957633972168, "num_tokens": 617990486.0, "step": 16198 }, { "epoch": 2.0606793028876735, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.94993209838867, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8765794634819031, "num_tokens": 618031743.0, "step": 16199 }, { "epoch": 2.0608065131662636, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 45.874019622802734, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8590375781059265, "num_tokens": 618070181.0, "step": 16200 }, { "epoch": 2.060933723444854, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.1656608581543, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8676351308822632, "num_tokens": 618106396.0, "step": 16201 }, { "epoch": 2.0610609337234447, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.22574996948242, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8682347536087036, "num_tokens": 618148281.0, "step": 16202 }, { "epoch": 2.061188144002035, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.19456481933594, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8565253615379333, "num_tokens": 618182721.0, "step": 16203 }, { "epoch": 2.0613153542806257, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.19394302368164, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8791363835334778, "num_tokens": 618219497.0, "step": 16204 }, { "epoch": 2.0614425645592163, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.96110534667969, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8804702758789062, "num_tokens": 618258643.0, "step": 16205 }, { "epoch": 2.061569774837807, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.68638610839844, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8778866529464722, "num_tokens": 618292682.0, "step": 16206 }, { "epoch": 2.0616969851163973, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.86142349243164, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8485907316207886, "num_tokens": 618331375.0, "step": 16207 }, { "epoch": 2.061824195394988, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.56870651245117, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8665241599082947, "num_tokens": 618367452.0, "step": 16208 }, { "epoch": 2.0619514056735784, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.94987487792969, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8600107431411743, "num_tokens": 618410470.0, "step": 16209 }, { "epoch": 2.062078615952169, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.23691177368164, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8561269044876099, "num_tokens": 618447524.0, "step": 16210 }, { "epoch": 2.0622058262307594, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.17820739746094, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8439867496490479, "num_tokens": 618481793.0, "step": 16211 }, { "epoch": 2.06233303650935, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.52631378173828, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.868085503578186, "num_tokens": 618517097.0, "step": 16212 }, { "epoch": 2.0624602467879405, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.04367446899414, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.859648585319519, "num_tokens": 618554221.0, "step": 16213 }, { "epoch": 2.062587457066531, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.585899353027344, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8721093535423279, "num_tokens": 618591279.0, "step": 16214 }, { "epoch": 2.0627146673451215, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.37428665161133, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8498245477676392, "num_tokens": 618628168.0, "step": 16215 }, { "epoch": 2.062841877623712, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.5136604309082, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8689285516738892, "num_tokens": 618664580.0, "step": 16216 }, { "epoch": 2.0629690879023026, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.07572555541992, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8504988551139832, "num_tokens": 618699007.0, "step": 16217 }, { "epoch": 2.063096298180893, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.32049560546875, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8648029565811157, "num_tokens": 618736630.0, "step": 16218 }, { "epoch": 2.0632235084594837, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.10691833496094, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8722600340843201, "num_tokens": 618777868.0, "step": 16219 }, { "epoch": 2.063350718738074, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.49056625366211, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.866827666759491, "num_tokens": 618819134.0, "step": 16220 }, { "epoch": 2.0634779290166647, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.2697868347168, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8620707988739014, "num_tokens": 618862231.0, "step": 16221 }, { "epoch": 2.0636051392952552, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.27740478515625, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8571624755859375, "num_tokens": 618906816.0, "step": 16222 }, { "epoch": 2.0637323495738458, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.085147857666016, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8624218702316284, "num_tokens": 618943239.0, "step": 16223 }, { "epoch": 2.0638595598524363, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.44562530517578, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8750861883163452, "num_tokens": 618978641.0, "step": 16224 }, { "epoch": 2.0639867701310264, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.775089263916016, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8637310266494751, "num_tokens": 619020522.0, "step": 16225 }, { "epoch": 2.064113980409617, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.03901672363281, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8744270205497742, "num_tokens": 619060769.0, "step": 16226 }, { "epoch": 2.0642411906882074, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.81248092651367, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8744515776634216, "num_tokens": 619098711.0, "step": 16227 }, { "epoch": 2.064368400966798, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.48767852783203, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8586087226867676, "num_tokens": 619133673.0, "step": 16228 }, { "epoch": 2.0644956112453885, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.252662658691406, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8572598695755005, "num_tokens": 619174554.0, "step": 16229 }, { "epoch": 2.064622821523979, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 45.82842254638672, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8527770638465881, "num_tokens": 619211095.0, "step": 16230 }, { "epoch": 2.0647500318025696, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.7965087890625, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.873243510723114, "num_tokens": 619246346.0, "step": 16231 }, { "epoch": 2.06487724208116, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.51346206665039, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8603050708770752, "num_tokens": 619280629.0, "step": 16232 }, { "epoch": 2.0650044523597506, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.05091094970703, "learning_rate": 1e-06, "loss": 0.647, "mean_token_accuracy": 0.8406741619110107, "num_tokens": 619320928.0, "step": 16233 }, { "epoch": 2.065131662638341, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.79180908203125, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8656487464904785, "num_tokens": 619369747.0, "step": 16234 }, { "epoch": 2.0652588729169317, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.01670837402344, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8660484552383423, "num_tokens": 619408694.0, "step": 16235 }, { "epoch": 2.065386083195522, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.05230712890625, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8715149164199829, "num_tokens": 619446858.0, "step": 16236 }, { "epoch": 2.0655132934741127, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.887088775634766, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8618549108505249, "num_tokens": 619482281.0, "step": 16237 }, { "epoch": 2.0656405037527032, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.578372955322266, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8715029358863831, "num_tokens": 619520309.0, "step": 16238 }, { "epoch": 2.065767714031294, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.27389144897461, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8659964203834534, "num_tokens": 619554662.0, "step": 16239 }, { "epoch": 2.0658949243098843, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.221492767333984, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8591374158859253, "num_tokens": 619599632.0, "step": 16240 }, { "epoch": 2.066022134588475, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.92971420288086, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8772920966148376, "num_tokens": 619639414.0, "step": 16241 }, { "epoch": 2.0661493448670654, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.56874465942383, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8642929792404175, "num_tokens": 619675878.0, "step": 16242 }, { "epoch": 2.066276555145656, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.783939361572266, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8668143153190613, "num_tokens": 619709486.0, "step": 16243 }, { "epoch": 2.0664037654242464, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.205162048339844, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8640619516372681, "num_tokens": 619753576.0, "step": 16244 }, { "epoch": 2.066530975702837, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.102294921875, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8641904592514038, "num_tokens": 619792520.0, "step": 16245 }, { "epoch": 2.0666581859814275, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.23045349121094, "learning_rate": 1e-06, "loss": 0.6673, "mean_token_accuracy": 0.8405447006225586, "num_tokens": 619827871.0, "step": 16246 }, { "epoch": 2.066785396260018, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.546417236328125, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8587505221366882, "num_tokens": 619864527.0, "step": 16247 }, { "epoch": 2.0669126065386085, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 47.08598327636719, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8644397258758545, "num_tokens": 619904934.0, "step": 16248 }, { "epoch": 2.0670398168171986, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.56330108642578, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8716129064559937, "num_tokens": 619945472.0, "step": 16249 }, { "epoch": 2.067167027095789, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.5617790222168, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8760854005813599, "num_tokens": 619981738.0, "step": 16250 }, { "epoch": 2.0672942373743797, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.44800567626953, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8650354743003845, "num_tokens": 620024593.0, "step": 16251 }, { "epoch": 2.06742144765297, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.2474479675293, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8695060610771179, "num_tokens": 620064059.0, "step": 16252 }, { "epoch": 2.0675486579315607, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.5550422668457, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8630568981170654, "num_tokens": 620100028.0, "step": 16253 }, { "epoch": 2.0676758682101513, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.42137145996094, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8762822151184082, "num_tokens": 620135181.0, "step": 16254 }, { "epoch": 2.067803078488742, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.654632568359375, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8639962673187256, "num_tokens": 620174887.0, "step": 16255 }, { "epoch": 2.0679302887673323, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.84550476074219, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8637745380401611, "num_tokens": 620212924.0, "step": 16256 }, { "epoch": 2.068057499045923, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.43234634399414, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8523799180984497, "num_tokens": 620251088.0, "step": 16257 }, { "epoch": 2.0681847093245134, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.70901870727539, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8614704012870789, "num_tokens": 620287888.0, "step": 16258 }, { "epoch": 2.068311919603104, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.63425064086914, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8621196746826172, "num_tokens": 620324937.0, "step": 16259 }, { "epoch": 2.0684391298816944, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.80683898925781, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8644974231719971, "num_tokens": 620363390.0, "step": 16260 }, { "epoch": 2.068566340160285, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.97958755493164, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8697831630706787, "num_tokens": 620403707.0, "step": 16261 }, { "epoch": 2.0686935504388755, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.7527961730957, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8672295808792114, "num_tokens": 620445994.0, "step": 16262 }, { "epoch": 2.068820760717466, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.71265411376953, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8722456097602844, "num_tokens": 620480883.0, "step": 16263 }, { "epoch": 2.0689479709960565, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.8916015625, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.876170814037323, "num_tokens": 620518935.0, "step": 16264 }, { "epoch": 2.069075181274647, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.09648513793945, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8577215671539307, "num_tokens": 620551985.0, "step": 16265 }, { "epoch": 2.0692023915532376, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.69050598144531, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8588700294494629, "num_tokens": 620592809.0, "step": 16266 }, { "epoch": 2.069329601831828, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.42475891113281, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8475075960159302, "num_tokens": 620627951.0, "step": 16267 }, { "epoch": 2.0694568121104187, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.13920211791992, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.868452250957489, "num_tokens": 620660892.0, "step": 16268 }, { "epoch": 2.069584022389009, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.795658111572266, "learning_rate": 1e-06, "loss": 0.6507, "mean_token_accuracy": 0.8382647633552551, "num_tokens": 620698165.0, "step": 16269 }, { "epoch": 2.0697112326675997, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.736244201660156, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8531962633132935, "num_tokens": 620734758.0, "step": 16270 }, { "epoch": 2.0698384429461902, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.482566833496094, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.8419923782348633, "num_tokens": 620770224.0, "step": 16271 }, { "epoch": 2.0699656532247808, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.18080139160156, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8707410097122192, "num_tokens": 620805724.0, "step": 16272 }, { "epoch": 2.070092863503371, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.896812438964844, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8513460159301758, "num_tokens": 620847622.0, "step": 16273 }, { "epoch": 2.0702200737819614, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.13143539428711, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.860704779624939, "num_tokens": 620879042.0, "step": 16274 }, { "epoch": 2.070347284060552, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.54543685913086, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8596069812774658, "num_tokens": 620922685.0, "step": 16275 }, { "epoch": 2.0704744943391424, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.45985412597656, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8582776188850403, "num_tokens": 620954854.0, "step": 16276 }, { "epoch": 2.070601704617733, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.06782150268555, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8605624437332153, "num_tokens": 620994581.0, "step": 16277 }, { "epoch": 2.0707289148963235, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.99159240722656, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8632413148880005, "num_tokens": 621030052.0, "step": 16278 }, { "epoch": 2.070856125174914, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.44297409057617, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8717470765113831, "num_tokens": 621072474.0, "step": 16279 }, { "epoch": 2.0709833354535045, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.81732177734375, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8661059141159058, "num_tokens": 621104365.0, "step": 16280 }, { "epoch": 2.071110545732095, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.620338439941406, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.86842280626297, "num_tokens": 621140497.0, "step": 16281 }, { "epoch": 2.0712377560106856, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.92148208618164, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8657252788543701, "num_tokens": 621177431.0, "step": 16282 }, { "epoch": 2.071364966289276, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.70097732543945, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8680474758148193, "num_tokens": 621212874.0, "step": 16283 }, { "epoch": 2.0714921765678667, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.63230514526367, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8685324788093567, "num_tokens": 621249927.0, "step": 16284 }, { "epoch": 2.071619386846457, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.76111602783203, "learning_rate": 1e-06, "loss": 0.6427, "mean_token_accuracy": 0.8376800417900085, "num_tokens": 621280794.0, "step": 16285 }, { "epoch": 2.0717465971250477, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.97779846191406, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.8458404541015625, "num_tokens": 621321167.0, "step": 16286 }, { "epoch": 2.0718738074036382, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.89365768432617, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8730686902999878, "num_tokens": 621357962.0, "step": 16287 }, { "epoch": 2.0720010176822288, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.03993225097656, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8807299733161926, "num_tokens": 621398933.0, "step": 16288 }, { "epoch": 2.0721282279608193, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.41474914550781, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8838968276977539, "num_tokens": 621442334.0, "step": 16289 }, { "epoch": 2.07225543823941, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.637962341308594, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8662842512130737, "num_tokens": 621478706.0, "step": 16290 }, { "epoch": 2.0723826485180004, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.47783279418945, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8694429993629456, "num_tokens": 621513911.0, "step": 16291 }, { "epoch": 2.072509858796591, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.2148551940918, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8571997880935669, "num_tokens": 621554080.0, "step": 16292 }, { "epoch": 2.0726370690751814, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.99546432495117, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8658676147460938, "num_tokens": 621592059.0, "step": 16293 }, { "epoch": 2.072764279353772, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 45.8923454284668, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8770702481269836, "num_tokens": 621629626.0, "step": 16294 }, { "epoch": 2.0728914896323625, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.4367790222168, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8679283261299133, "num_tokens": 621671341.0, "step": 16295 }, { "epoch": 2.073018699910953, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 45.68209457397461, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8794794082641602, "num_tokens": 621708756.0, "step": 16296 }, { "epoch": 2.0731459101895435, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.25931167602539, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8686070442199707, "num_tokens": 621748009.0, "step": 16297 }, { "epoch": 2.0732731204681336, "ewc_loss": 0.1669921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014495849609375, "grad_norm": 45.44770050048828, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8717256784439087, "num_tokens": 621788695.0, "step": 16298 }, { "epoch": 2.073400330746724, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.57207107543945, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8621208071708679, "num_tokens": 621828099.0, "step": 16299 }, { "epoch": 2.0735275410253147, "ewc_loss": 0.1689453125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001468658447265625, "grad_norm": 46.35737991333008, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8466688394546509, "num_tokens": 621867033.0, "step": 16300 }, { "epoch": 2.073654751303905, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.545475006103516, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8535974621772766, "num_tokens": 621907435.0, "step": 16301 }, { "epoch": 2.0737819615824957, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.00010299682617, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8526877164840698, "num_tokens": 621948995.0, "step": 16302 }, { "epoch": 2.0739091718610863, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.545528411865234, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8760191202163696, "num_tokens": 621991109.0, "step": 16303 }, { "epoch": 2.074036382139677, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.00014114379883, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8698776960372925, "num_tokens": 622026585.0, "step": 16304 }, { "epoch": 2.0741635924182673, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.605037689208984, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.871616542339325, "num_tokens": 622065208.0, "step": 16305 }, { "epoch": 2.074290802696858, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.00767135620117, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.853276252746582, "num_tokens": 622104838.0, "step": 16306 }, { "epoch": 2.0744180129754484, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.39208221435547, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8565753698348999, "num_tokens": 622136879.0, "step": 16307 }, { "epoch": 2.074545223254039, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.905025482177734, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8751786947250366, "num_tokens": 622175938.0, "step": 16308 }, { "epoch": 2.0746724335326294, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.76717758178711, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8706333041191101, "num_tokens": 622216663.0, "step": 16309 }, { "epoch": 2.07479964381122, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.730587005615234, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.870914876461029, "num_tokens": 622250450.0, "step": 16310 }, { "epoch": 2.0749268540898105, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.09103012084961, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8556724190711975, "num_tokens": 622291919.0, "step": 16311 }, { "epoch": 2.075054064368401, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.48625564575195, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8634417057037354, "num_tokens": 622325892.0, "step": 16312 }, { "epoch": 2.0751812746469915, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.17470932006836, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8624790906906128, "num_tokens": 622356966.0, "step": 16313 }, { "epoch": 2.075308484925582, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.433555603027344, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8671891093254089, "num_tokens": 622389931.0, "step": 16314 }, { "epoch": 2.0754356952041726, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.174171447753906, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8624321818351746, "num_tokens": 622427817.0, "step": 16315 }, { "epoch": 2.075562905482763, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.393245697021484, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8544265627861023, "num_tokens": 622470353.0, "step": 16316 }, { "epoch": 2.0756901157613536, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.22106170654297, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8809308409690857, "num_tokens": 622507577.0, "step": 16317 }, { "epoch": 2.075817326039944, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.40956497192383, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8573256134986877, "num_tokens": 622544746.0, "step": 16318 }, { "epoch": 2.0759445363185347, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.37311935424805, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8559726476669312, "num_tokens": 622583688.0, "step": 16319 }, { "epoch": 2.0760717465971252, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.210426330566406, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8634908199310303, "num_tokens": 622619911.0, "step": 16320 }, { "epoch": 2.0761989568757158, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.9027099609375, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8720518350601196, "num_tokens": 622665732.0, "step": 16321 }, { "epoch": 2.0763261671543063, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.1043815612793, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8721599578857422, "num_tokens": 622705746.0, "step": 16322 }, { "epoch": 2.0764533774328964, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.35468292236328, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8637257814407349, "num_tokens": 622745481.0, "step": 16323 }, { "epoch": 2.076580587711487, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.61948776245117, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8661184310913086, "num_tokens": 622786910.0, "step": 16324 }, { "epoch": 2.0767077979900774, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.70083236694336, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8596479892730713, "num_tokens": 622826976.0, "step": 16325 }, { "epoch": 2.076835008268668, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.28360366821289, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8645997643470764, "num_tokens": 622858945.0, "step": 16326 }, { "epoch": 2.0769622185472585, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.9727668762207, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8765807747840881, "num_tokens": 622893229.0, "step": 16327 }, { "epoch": 2.077089428825849, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.04572677612305, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8735939264297485, "num_tokens": 622933701.0, "step": 16328 }, { "epoch": 2.0772166391044395, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.88717269897461, "learning_rate": 1e-06, "loss": 0.6196, "mean_token_accuracy": 0.850873589515686, "num_tokens": 622971226.0, "step": 16329 }, { "epoch": 2.07734384938303, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.67182922363281, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8593742251396179, "num_tokens": 623009232.0, "step": 16330 }, { "epoch": 2.0774710596616206, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.37407684326172, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8559612035751343, "num_tokens": 623048005.0, "step": 16331 }, { "epoch": 2.077598269940211, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.98736572265625, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8631559014320374, "num_tokens": 623088474.0, "step": 16332 }, { "epoch": 2.0777254802188017, "ewc_loss": 0.171875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.75236511230469, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8654707074165344, "num_tokens": 623123700.0, "step": 16333 }, { "epoch": 2.077852690497392, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.21002197265625, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8659121990203857, "num_tokens": 623161406.0, "step": 16334 }, { "epoch": 2.0779799007759827, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.53726577758789, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.854433536529541, "num_tokens": 623199770.0, "step": 16335 }, { "epoch": 2.0781071110545732, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.450199127197266, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8663555979728699, "num_tokens": 623234635.0, "step": 16336 }, { "epoch": 2.0782343213331638, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.51882553100586, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.865290641784668, "num_tokens": 623273695.0, "step": 16337 }, { "epoch": 2.0783615316117543, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.33924102783203, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8490387201309204, "num_tokens": 623315278.0, "step": 16338 }, { "epoch": 2.078488741890345, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.45221710205078, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8719558715820312, "num_tokens": 623351753.0, "step": 16339 }, { "epoch": 2.0786159521689354, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.24977111816406, "learning_rate": 1e-06, "loss": 0.6502, "mean_token_accuracy": 0.8444373607635498, "num_tokens": 623387812.0, "step": 16340 }, { "epoch": 2.078743162447526, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.0052604675293, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8673645853996277, "num_tokens": 623426784.0, "step": 16341 }, { "epoch": 2.0788703727261164, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.887245178222656, "learning_rate": 1e-06, "loss": 0.6656, "mean_token_accuracy": 0.8420343399047852, "num_tokens": 623463362.0, "step": 16342 }, { "epoch": 2.078997583004707, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.839569091796875, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8562636375427246, "num_tokens": 623500108.0, "step": 16343 }, { "epoch": 2.0791247932832975, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.350406646728516, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8606103658676147, "num_tokens": 623537159.0, "step": 16344 }, { "epoch": 2.079252003561888, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.004886627197266, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8824068307876587, "num_tokens": 623576256.0, "step": 16345 }, { "epoch": 2.0793792138404785, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.697174072265625, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8582419753074646, "num_tokens": 623612454.0, "step": 16346 }, { "epoch": 2.0795064241190686, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.50937271118164, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.877558708190918, "num_tokens": 623654392.0, "step": 16347 }, { "epoch": 2.079633634397659, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.597347259521484, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8447117805480957, "num_tokens": 623697350.0, "step": 16348 }, { "epoch": 2.0797608446762497, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.09701919555664, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.867108166217804, "num_tokens": 623736272.0, "step": 16349 }, { "epoch": 2.07988805495484, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.95105743408203, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8554143309593201, "num_tokens": 623781319.0, "step": 16350 }, { "epoch": 2.0800152652334307, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.792118072509766, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8632886409759521, "num_tokens": 623821944.0, "step": 16351 }, { "epoch": 2.0801424755120212, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.92778396606445, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8592061996459961, "num_tokens": 623858399.0, "step": 16352 }, { "epoch": 2.0802696857906118, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.200260162353516, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8648660182952881, "num_tokens": 623897839.0, "step": 16353 }, { "epoch": 2.0803968960692023, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.20438003540039, "learning_rate": 1e-06, "loss": 0.6389, "mean_token_accuracy": 0.8461545705795288, "num_tokens": 623933127.0, "step": 16354 }, { "epoch": 2.080524106347793, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.26987075805664, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8618483543395996, "num_tokens": 623972140.0, "step": 16355 }, { "epoch": 2.0806513166263834, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.23158264160156, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8528097867965698, "num_tokens": 624008875.0, "step": 16356 }, { "epoch": 2.080778526904974, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.93031311035156, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8612831830978394, "num_tokens": 624045162.0, "step": 16357 }, { "epoch": 2.0809057371835644, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.50061798095703, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8748581409454346, "num_tokens": 624085275.0, "step": 16358 }, { "epoch": 2.081032947462155, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.64242935180664, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8612135648727417, "num_tokens": 624127479.0, "step": 16359 }, { "epoch": 2.0811601577407455, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.17270278930664, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.86536705493927, "num_tokens": 624165231.0, "step": 16360 }, { "epoch": 2.081287368019336, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.514625549316406, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8531001806259155, "num_tokens": 624205109.0, "step": 16361 }, { "epoch": 2.0814145782979265, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.906497955322266, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8584756851196289, "num_tokens": 624241849.0, "step": 16362 }, { "epoch": 2.081541788576517, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.410614013671875, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8632134795188904, "num_tokens": 624274174.0, "step": 16363 }, { "epoch": 2.0816689988551076, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.515777587890625, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8619231581687927, "num_tokens": 624311526.0, "step": 16364 }, { "epoch": 2.081796209133698, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.762664794921875, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8484660983085632, "num_tokens": 624346657.0, "step": 16365 }, { "epoch": 2.0819234194122886, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.62456512451172, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8473500609397888, "num_tokens": 624386563.0, "step": 16366 }, { "epoch": 2.082050629690879, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.47861862182617, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8723636269569397, "num_tokens": 624427299.0, "step": 16367 }, { "epoch": 2.0821778399694697, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.38637924194336, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8454066514968872, "num_tokens": 624461253.0, "step": 16368 }, { "epoch": 2.0823050502480602, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.50019073486328, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8710181713104248, "num_tokens": 624498699.0, "step": 16369 }, { "epoch": 2.0824322605266508, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.59860610961914, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.8607202768325806, "num_tokens": 624534655.0, "step": 16370 }, { "epoch": 2.082559470805241, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.64988327026367, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8667340278625488, "num_tokens": 624575342.0, "step": 16371 }, { "epoch": 2.0826866810838314, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.352115631103516, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8673399686813354, "num_tokens": 624608493.0, "step": 16372 }, { "epoch": 2.082813891362422, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.5081787109375, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8443222641944885, "num_tokens": 624655777.0, "step": 16373 }, { "epoch": 2.0829411016410124, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.48140335083008, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8729993104934692, "num_tokens": 624699349.0, "step": 16374 }, { "epoch": 2.083068311919603, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.72040557861328, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8693447113037109, "num_tokens": 624730146.0, "step": 16375 }, { "epoch": 2.0831955221981935, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.20380783081055, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8546559810638428, "num_tokens": 624773186.0, "step": 16376 }, { "epoch": 2.083322732476784, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.20576095581055, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8599392771720886, "num_tokens": 624810256.0, "step": 16377 }, { "epoch": 2.0834499427553745, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.44974136352539, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.8532471656799316, "num_tokens": 624848166.0, "step": 16378 }, { "epoch": 2.083577153033965, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.3499641418457, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.860641360282898, "num_tokens": 624887601.0, "step": 16379 }, { "epoch": 2.0837043633125556, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.514400482177734, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8620554804801941, "num_tokens": 624921160.0, "step": 16380 }, { "epoch": 2.083831573591146, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.222110748291016, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.855400025844574, "num_tokens": 624959053.0, "step": 16381 }, { "epoch": 2.0839587838697367, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 45.99612045288086, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8769954442977905, "num_tokens": 624995272.0, "step": 16382 }, { "epoch": 2.084085994148327, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.378753662109375, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8797481060028076, "num_tokens": 625033388.0, "step": 16383 }, { "epoch": 2.0842132044269177, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.09376525878906, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8521226644515991, "num_tokens": 625071103.0, "step": 16384 }, { "epoch": 2.0843404147055082, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.8248291015625, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.87450110912323, "num_tokens": 625109562.0, "step": 16385 }, { "epoch": 2.0844676249840988, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.16908264160156, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8677831888198853, "num_tokens": 625147438.0, "step": 16386 }, { "epoch": 2.0845948352626893, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.63943862915039, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8723570108413696, "num_tokens": 625184293.0, "step": 16387 }, { "epoch": 2.08472204554128, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.67420959472656, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8641282916069031, "num_tokens": 625218201.0, "step": 16388 }, { "epoch": 2.0848492558198704, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.22785568237305, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8611464500427246, "num_tokens": 625252072.0, "step": 16389 }, { "epoch": 2.084976466098461, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.04774475097656, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8760233521461487, "num_tokens": 625286990.0, "step": 16390 }, { "epoch": 2.0851036763770514, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.307273864746094, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8745290040969849, "num_tokens": 625326579.0, "step": 16391 }, { "epoch": 2.085230886655642, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.049278259277344, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.868930995464325, "num_tokens": 625367317.0, "step": 16392 }, { "epoch": 2.0853580969342325, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.260040283203125, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8550226092338562, "num_tokens": 625401047.0, "step": 16393 }, { "epoch": 2.085485307212823, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.854122161865234, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8476203680038452, "num_tokens": 625441878.0, "step": 16394 }, { "epoch": 2.0856125174914135, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.63345718383789, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8605331182479858, "num_tokens": 625477007.0, "step": 16395 }, { "epoch": 2.0857397277700036, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.71043014526367, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.874396026134491, "num_tokens": 625519908.0, "step": 16396 }, { "epoch": 2.085866938048594, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.86443328857422, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8669129014015198, "num_tokens": 625554946.0, "step": 16397 }, { "epoch": 2.0859941483271847, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.07257080078125, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8736907243728638, "num_tokens": 625592121.0, "step": 16398 }, { "epoch": 2.086121358605775, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.692230224609375, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8619800209999084, "num_tokens": 625630596.0, "step": 16399 }, { "epoch": 2.0862485688843657, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.62361145019531, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8457950353622437, "num_tokens": 625669535.0, "step": 16400 }, { "epoch": 2.0863757791629562, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.7552490234375, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8612816333770752, "num_tokens": 625710072.0, "step": 16401 }, { "epoch": 2.0865029894415468, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.12744140625, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.871728777885437, "num_tokens": 625746328.0, "step": 16402 }, { "epoch": 2.0866301997201373, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.09043884277344, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8640459179878235, "num_tokens": 625787108.0, "step": 16403 }, { "epoch": 2.086757409998728, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.92511749267578, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8665382862091064, "num_tokens": 625827887.0, "step": 16404 }, { "epoch": 2.0868846202773184, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.58282470703125, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.862506091594696, "num_tokens": 625863475.0, "step": 16405 }, { "epoch": 2.087011830555909, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.10268783569336, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.87895268201828, "num_tokens": 625902369.0, "step": 16406 }, { "epoch": 2.0871390408344994, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.237361907958984, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8366588354110718, "num_tokens": 625941054.0, "step": 16407 }, { "epoch": 2.08726625111309, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.77256774902344, "learning_rate": 1e-06, "loss": 0.6523, "mean_token_accuracy": 0.8435853719711304, "num_tokens": 625981113.0, "step": 16408 }, { "epoch": 2.0873934613916805, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.510982513427734, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8578537702560425, "num_tokens": 626014758.0, "step": 16409 }, { "epoch": 2.087520671670271, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.49416732788086, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8470982313156128, "num_tokens": 626054442.0, "step": 16410 }, { "epoch": 2.0876478819488615, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.13676452636719, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8432992696762085, "num_tokens": 626093993.0, "step": 16411 }, { "epoch": 2.087775092227452, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.32893371582031, "learning_rate": 1e-06, "loss": 0.6422, "mean_token_accuracy": 0.8459206819534302, "num_tokens": 626131242.0, "step": 16412 }, { "epoch": 2.0879023025060426, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.05294418334961, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8476191163063049, "num_tokens": 626162137.0, "step": 16413 }, { "epoch": 2.088029512784633, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.854942321777344, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8635355234146118, "num_tokens": 626201031.0, "step": 16414 }, { "epoch": 2.0881567230632236, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.43532943725586, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8718297481536865, "num_tokens": 626243647.0, "step": 16415 }, { "epoch": 2.088283933341814, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.59600830078125, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8680640459060669, "num_tokens": 626281346.0, "step": 16416 }, { "epoch": 2.0884111436204047, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.27214050292969, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8723580837249756, "num_tokens": 626315964.0, "step": 16417 }, { "epoch": 2.0885383538989952, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.97201156616211, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8563646078109741, "num_tokens": 626367920.0, "step": 16418 }, { "epoch": 2.0886655641775858, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.20647048950195, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8590656518936157, "num_tokens": 626412123.0, "step": 16419 }, { "epoch": 2.0887927744561763, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.76211166381836, "learning_rate": 1e-06, "loss": 0.6734, "mean_token_accuracy": 0.8327668905258179, "num_tokens": 626451560.0, "step": 16420 }, { "epoch": 2.0889199847347664, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.08000564575195, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8558361530303955, "num_tokens": 626489361.0, "step": 16421 }, { "epoch": 2.089047195013357, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.87464904785156, "learning_rate": 1e-06, "loss": 0.5424, "mean_token_accuracy": 0.8783299326896667, "num_tokens": 626527150.0, "step": 16422 }, { "epoch": 2.0891744052919474, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.03082275390625, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8544750809669495, "num_tokens": 626568110.0, "step": 16423 }, { "epoch": 2.089301615570538, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.262508392333984, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8634085655212402, "num_tokens": 626607215.0, "step": 16424 }, { "epoch": 2.0894288258491285, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.755489349365234, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8674907684326172, "num_tokens": 626644424.0, "step": 16425 }, { "epoch": 2.089556036127719, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.400238037109375, "learning_rate": 1e-06, "loss": 0.6574, "mean_token_accuracy": 0.8413971662521362, "num_tokens": 626689509.0, "step": 16426 }, { "epoch": 2.0896832464063095, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.89054870605469, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8499157428741455, "num_tokens": 626733866.0, "step": 16427 }, { "epoch": 2.0898104566849, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.5003776550293, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8705255389213562, "num_tokens": 626767465.0, "step": 16428 }, { "epoch": 2.0899376669634906, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.396785736083984, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8477047681808472, "num_tokens": 626806049.0, "step": 16429 }, { "epoch": 2.090064877242081, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.982398986816406, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.848281741142273, "num_tokens": 626848917.0, "step": 16430 }, { "epoch": 2.0901920875206716, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.626197814941406, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.84930419921875, "num_tokens": 626884442.0, "step": 16431 }, { "epoch": 2.090319297799262, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.73555374145508, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8592469096183777, "num_tokens": 626922517.0, "step": 16432 }, { "epoch": 2.0904465080778527, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.3431510925293, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8605835437774658, "num_tokens": 626962880.0, "step": 16433 }, { "epoch": 2.0905737183564432, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.06977844238281, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8690974116325378, "num_tokens": 627008115.0, "step": 16434 }, { "epoch": 2.0907009286350338, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 45.85903549194336, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8746292591094971, "num_tokens": 627043935.0, "step": 16435 }, { "epoch": 2.0908281389136243, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.89128875732422, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8622003793716431, "num_tokens": 627078495.0, "step": 16436 }, { "epoch": 2.090955349192215, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.11790466308594, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8654291033744812, "num_tokens": 627116652.0, "step": 16437 }, { "epoch": 2.0910825594708053, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.582061767578125, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8693379759788513, "num_tokens": 627152326.0, "step": 16438 }, { "epoch": 2.091209769749396, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.55893325805664, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.872588038444519, "num_tokens": 627189502.0, "step": 16439 }, { "epoch": 2.0913369800279864, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.22611618041992, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8757134675979614, "num_tokens": 627225438.0, "step": 16440 }, { "epoch": 2.091464190306577, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.3704719543457, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8590803742408752, "num_tokens": 627258154.0, "step": 16441 }, { "epoch": 2.0915914005851675, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.86422348022461, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8547465801239014, "num_tokens": 627301804.0, "step": 16442 }, { "epoch": 2.091718610863758, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.55422592163086, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8663305044174194, "num_tokens": 627344301.0, "step": 16443 }, { "epoch": 2.0918458211423485, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.83765411376953, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8692476153373718, "num_tokens": 627381068.0, "step": 16444 }, { "epoch": 2.0919730314209386, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.37242126464844, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8548535704612732, "num_tokens": 627423410.0, "step": 16445 }, { "epoch": 2.092100241699529, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.403175354003906, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8710860013961792, "num_tokens": 627456593.0, "step": 16446 }, { "epoch": 2.0922274519781197, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.60141372680664, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8644979000091553, "num_tokens": 627493742.0, "step": 16447 }, { "epoch": 2.09235466225671, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.78068923950195, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8677947521209717, "num_tokens": 627529809.0, "step": 16448 }, { "epoch": 2.0924818725353007, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 45.68660354614258, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8655388951301575, "num_tokens": 627570693.0, "step": 16449 }, { "epoch": 2.0926090828138912, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.4057731628418, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8709863424301147, "num_tokens": 627607608.0, "step": 16450 }, { "epoch": 2.0927362930924818, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 45.935874938964844, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8603901863098145, "num_tokens": 627642102.0, "step": 16451 }, { "epoch": 2.0928635033710723, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.418460845947266, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8550827503204346, "num_tokens": 627678838.0, "step": 16452 }, { "epoch": 2.092990713649663, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.16512680053711, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8665346503257751, "num_tokens": 627714952.0, "step": 16453 }, { "epoch": 2.0931179239282534, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.513614654541016, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8526531457901001, "num_tokens": 627754710.0, "step": 16454 }, { "epoch": 2.093245134206844, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 45.941497802734375, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8661757111549377, "num_tokens": 627791504.0, "step": 16455 }, { "epoch": 2.0933723444854344, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.14919662475586, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8766351342201233, "num_tokens": 627829683.0, "step": 16456 }, { "epoch": 2.093499554764025, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.41259002685547, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8636899590492249, "num_tokens": 627869826.0, "step": 16457 }, { "epoch": 2.0936267650426155, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.76066589355469, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8582220077514648, "num_tokens": 627904165.0, "step": 16458 }, { "epoch": 2.093753975321206, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.74281311035156, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8680521845817566, "num_tokens": 627936559.0, "step": 16459 }, { "epoch": 2.0938811855997965, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.67857360839844, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8672729134559631, "num_tokens": 627971742.0, "step": 16460 }, { "epoch": 2.094008395878387, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.62641525268555, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8716749548912048, "num_tokens": 628013104.0, "step": 16461 }, { "epoch": 2.0941356061569776, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.51444625854492, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8593174815177917, "num_tokens": 628049378.0, "step": 16462 }, { "epoch": 2.094262816435568, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.59577178955078, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8804991245269775, "num_tokens": 628087213.0, "step": 16463 }, { "epoch": 2.0943900267141586, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.47993850708008, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8668639659881592, "num_tokens": 628122060.0, "step": 16464 }, { "epoch": 2.094517236992749, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.584266662597656, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8648955821990967, "num_tokens": 628166554.0, "step": 16465 }, { "epoch": 2.0946444472713397, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.092098236083984, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8602882623672485, "num_tokens": 628199953.0, "step": 16466 }, { "epoch": 2.09477165754993, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.825279235839844, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8641064167022705, "num_tokens": 628237044.0, "step": 16467 }, { "epoch": 2.0948988678285207, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.25539016723633, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8644965291023254, "num_tokens": 628275773.0, "step": 16468 }, { "epoch": 2.095026078107111, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.62709045410156, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8473304510116577, "num_tokens": 628314549.0, "step": 16469 }, { "epoch": 2.0951532883857014, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.493892669677734, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8575046062469482, "num_tokens": 628349417.0, "step": 16470 }, { "epoch": 2.095280498664292, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.215919494628906, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8669472932815552, "num_tokens": 628384435.0, "step": 16471 }, { "epoch": 2.0954077089428824, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.11434555053711, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8483003377914429, "num_tokens": 628424124.0, "step": 16472 }, { "epoch": 2.095534919221473, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.202125549316406, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8652788400650024, "num_tokens": 628462412.0, "step": 16473 }, { "epoch": 2.0956621295000635, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.05990219116211, "learning_rate": 1e-06, "loss": 0.6438, "mean_token_accuracy": 0.8454058170318604, "num_tokens": 628499066.0, "step": 16474 }, { "epoch": 2.095789339778654, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.44784927368164, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8772258758544922, "num_tokens": 628532379.0, "step": 16475 }, { "epoch": 2.0959165500572445, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.51277542114258, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8724958300590515, "num_tokens": 628570497.0, "step": 16476 }, { "epoch": 2.096043760335835, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.3944206237793, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8679436445236206, "num_tokens": 628608298.0, "step": 16477 }, { "epoch": 2.0961709706144256, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.07184600830078, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8807730078697205, "num_tokens": 628641217.0, "step": 16478 }, { "epoch": 2.096298180893016, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.423675537109375, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8635126352310181, "num_tokens": 628682480.0, "step": 16479 }, { "epoch": 2.0964253911716066, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.12940979003906, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8602569103240967, "num_tokens": 628718359.0, "step": 16480 }, { "epoch": 2.096552601450197, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.42301940917969, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8709059953689575, "num_tokens": 628752214.0, "step": 16481 }, { "epoch": 2.0966798117287877, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.33405685424805, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8759253621101379, "num_tokens": 628787882.0, "step": 16482 }, { "epoch": 2.0968070220073782, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.66035842895508, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8587336540222168, "num_tokens": 628824910.0, "step": 16483 }, { "epoch": 2.0969342322859688, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.21607208251953, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.874531626701355, "num_tokens": 628860657.0, "step": 16484 }, { "epoch": 2.0970614425645593, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.5616455078125, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8444069027900696, "num_tokens": 628900619.0, "step": 16485 }, { "epoch": 2.09718865284315, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.29071807861328, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8739593625068665, "num_tokens": 628944304.0, "step": 16486 }, { "epoch": 2.0973158631217403, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.295902252197266, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8668703436851501, "num_tokens": 628980490.0, "step": 16487 }, { "epoch": 2.097443073400331, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.551456451416016, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.866816520690918, "num_tokens": 629011924.0, "step": 16488 }, { "epoch": 2.0975702836789214, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.65004348754883, "learning_rate": 1e-06, "loss": 0.6666, "mean_token_accuracy": 0.837221622467041, "num_tokens": 629049514.0, "step": 16489 }, { "epoch": 2.097697493957512, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.47874069213867, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8803614377975464, "num_tokens": 629085374.0, "step": 16490 }, { "epoch": 2.0978247042361025, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.71412658691406, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8603479862213135, "num_tokens": 629126307.0, "step": 16491 }, { "epoch": 2.097951914514693, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.69755554199219, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8631504774093628, "num_tokens": 629159693.0, "step": 16492 }, { "epoch": 2.0980791247932835, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.52135467529297, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8687028884887695, "num_tokens": 629198544.0, "step": 16493 }, { "epoch": 2.0982063350718736, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.10696792602539, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.855192244052887, "num_tokens": 629240956.0, "step": 16494 }, { "epoch": 2.098333545350464, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.08732223510742, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8635272979736328, "num_tokens": 629276196.0, "step": 16495 }, { "epoch": 2.0984607556290547, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.10171127319336, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8500213623046875, "num_tokens": 629311398.0, "step": 16496 }, { "epoch": 2.098587965907645, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.37350845336914, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.85822993516922, "num_tokens": 629349488.0, "step": 16497 }, { "epoch": 2.0987151761862357, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.39024353027344, "learning_rate": 1e-06, "loss": 0.5412, "mean_token_accuracy": 0.8770162463188171, "num_tokens": 629388023.0, "step": 16498 }, { "epoch": 2.0988423864648262, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.32318878173828, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8635120391845703, "num_tokens": 629432140.0, "step": 16499 }, { "epoch": 2.0989695967434168, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.55976104736328, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.8490233421325684, "num_tokens": 629480690.0, "step": 16500 }, { "epoch": 2.0990968070220073, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.88827133178711, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8802841305732727, "num_tokens": 629522500.0, "step": 16501 }, { "epoch": 2.099224017300598, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.26280975341797, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8648948669433594, "num_tokens": 629563450.0, "step": 16502 }, { "epoch": 2.0993512275791884, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.42341995239258, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8733575344085693, "num_tokens": 629603753.0, "step": 16503 }, { "epoch": 2.099478437857779, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.763675689697266, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.865670919418335, "num_tokens": 629637828.0, "step": 16504 }, { "epoch": 2.0996056481363694, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.146522521972656, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8701878786087036, "num_tokens": 629674636.0, "step": 16505 }, { "epoch": 2.09973285841496, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.468772888183594, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.8534700870513916, "num_tokens": 629717046.0, "step": 16506 }, { "epoch": 2.0998600686935505, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.642669677734375, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8730912804603577, "num_tokens": 629750331.0, "step": 16507 }, { "epoch": 2.099987278972141, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.33629608154297, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8443094491958618, "num_tokens": 629790095.0, "step": 16508 }, { "epoch": 2.1001144892507315, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.793941497802734, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8780504465103149, "num_tokens": 629823035.0, "step": 16509 }, { "epoch": 2.100241699529322, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.20546340942383, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8581953048706055, "num_tokens": 629857096.0, "step": 16510 }, { "epoch": 2.1003689098079126, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.83417510986328, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8624585270881653, "num_tokens": 629893448.0, "step": 16511 }, { "epoch": 2.100496120086503, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.467464447021484, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8614557981491089, "num_tokens": 629931301.0, "step": 16512 }, { "epoch": 2.1006233303650936, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.622474670410156, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8696883916854858, "num_tokens": 629963481.0, "step": 16513 }, { "epoch": 2.100750540643684, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.641387939453125, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.85854172706604, "num_tokens": 630002739.0, "step": 16514 }, { "epoch": 2.1008777509222747, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.41756820678711, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8552465438842773, "num_tokens": 630041206.0, "step": 16515 }, { "epoch": 2.101004961200865, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.77481460571289, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8486440181732178, "num_tokens": 630083811.0, "step": 16516 }, { "epoch": 2.1011321714794557, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.207183837890625, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8618015050888062, "num_tokens": 630119823.0, "step": 16517 }, { "epoch": 2.1012593817580463, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.80698013305664, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8634848594665527, "num_tokens": 630163807.0, "step": 16518 }, { "epoch": 2.1013865920366364, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.79948806762695, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8575401306152344, "num_tokens": 630203038.0, "step": 16519 }, { "epoch": 2.101513802315227, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.54341125488281, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8664560317993164, "num_tokens": 630238505.0, "step": 16520 }, { "epoch": 2.1016410125938174, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.89546203613281, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8620458245277405, "num_tokens": 630275279.0, "step": 16521 }, { "epoch": 2.101768222872408, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.300045013427734, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8600760698318481, "num_tokens": 630317288.0, "step": 16522 }, { "epoch": 2.1018954331509985, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.405609130859375, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8625991940498352, "num_tokens": 630352786.0, "step": 16523 }, { "epoch": 2.102022643429589, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.04388427734375, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8648829460144043, "num_tokens": 630391138.0, "step": 16524 }, { "epoch": 2.1021498537081795, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.32155227661133, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.850278377532959, "num_tokens": 630426567.0, "step": 16525 }, { "epoch": 2.10227706398677, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.948673248291016, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8526410460472107, "num_tokens": 630467308.0, "step": 16526 }, { "epoch": 2.1024042742653606, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.30243682861328, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8709237575531006, "num_tokens": 630503333.0, "step": 16527 }, { "epoch": 2.102531484543951, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.52256774902344, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8573883771896362, "num_tokens": 630536838.0, "step": 16528 }, { "epoch": 2.1026586948225416, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.06770324707031, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.875802755355835, "num_tokens": 630570833.0, "step": 16529 }, { "epoch": 2.102785905101132, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.074283599853516, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8721932172775269, "num_tokens": 630615043.0, "step": 16530 }, { "epoch": 2.1029131153797227, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.64996337890625, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.886143147945404, "num_tokens": 630651368.0, "step": 16531 }, { "epoch": 2.1030403256583132, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.20308303833008, "learning_rate": 1e-06, "loss": 0.5983, "mean_token_accuracy": 0.8542969226837158, "num_tokens": 630690341.0, "step": 16532 }, { "epoch": 2.1031675359369038, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.77583694458008, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8598294258117676, "num_tokens": 630727168.0, "step": 16533 }, { "epoch": 2.1032947462154943, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.75971221923828, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8673728704452515, "num_tokens": 630766737.0, "step": 16534 }, { "epoch": 2.103421956494085, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.543399810791016, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8610306978225708, "num_tokens": 630803721.0, "step": 16535 }, { "epoch": 2.1035491667726753, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.74205780029297, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.866714358329773, "num_tokens": 630843697.0, "step": 16536 }, { "epoch": 2.103676377051266, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.36119842529297, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8622726798057556, "num_tokens": 630874325.0, "step": 16537 }, { "epoch": 2.1038035873298564, "ewc_loss": 0.169921875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014781951904296875, "grad_norm": 46.79259490966797, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8596680164337158, "num_tokens": 630911456.0, "step": 16538 }, { "epoch": 2.103930797608447, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.30534744262695, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8618345260620117, "num_tokens": 630954418.0, "step": 16539 }, { "epoch": 2.1040580078870375, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.631134033203125, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.862706184387207, "num_tokens": 630991826.0, "step": 16540 }, { "epoch": 2.104185218165628, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.444175720214844, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8640656471252441, "num_tokens": 631025831.0, "step": 16541 }, { "epoch": 2.1043124284442185, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 47.04636764526367, "learning_rate": 1e-06, "loss": 0.6344, "mean_token_accuracy": 0.8491917848587036, "num_tokens": 631062896.0, "step": 16542 }, { "epoch": 2.1044396387228086, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.39836502075195, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8691223859786987, "num_tokens": 631099291.0, "step": 16543 }, { "epoch": 2.104566849001399, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.95881652832031, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8539128303527832, "num_tokens": 631141570.0, "step": 16544 }, { "epoch": 2.1046940592799896, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.71759796142578, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8558437824249268, "num_tokens": 631178882.0, "step": 16545 }, { "epoch": 2.10482126955858, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.3082389831543, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8605514764785767, "num_tokens": 631218059.0, "step": 16546 }, { "epoch": 2.1049484798371707, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.55677795410156, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8637712597846985, "num_tokens": 631252827.0, "step": 16547 }, { "epoch": 2.1050756901157612, "ewc_loss": 0.16796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014591217041015625, "grad_norm": 46.62294387817383, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8699198365211487, "num_tokens": 631294370.0, "step": 16548 }, { "epoch": 2.1052029003943518, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.926246643066406, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8594274520874023, "num_tokens": 631331091.0, "step": 16549 }, { "epoch": 2.1053301106729423, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.62596130371094, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8649896383285522, "num_tokens": 631367285.0, "step": 16550 }, { "epoch": 2.105457320951533, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.84123992919922, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8502066135406494, "num_tokens": 631402471.0, "step": 16551 }, { "epoch": 2.1055845312301233, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.93173599243164, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8606483340263367, "num_tokens": 631439706.0, "step": 16552 }, { "epoch": 2.105711741508714, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.74345779418945, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8683057427406311, "num_tokens": 631471437.0, "step": 16553 }, { "epoch": 2.1058389517873044, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 47.165714263916016, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.859411358833313, "num_tokens": 631515028.0, "step": 16554 }, { "epoch": 2.105966162065895, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.89122772216797, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8574669361114502, "num_tokens": 631552946.0, "step": 16555 }, { "epoch": 2.1060933723444855, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.078529357910156, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8673909902572632, "num_tokens": 631596045.0, "step": 16556 }, { "epoch": 2.106220582623076, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.475807189941406, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8720715045928955, "num_tokens": 631630233.0, "step": 16557 }, { "epoch": 2.1063477929016665, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.6307373046875, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.868370771408081, "num_tokens": 631666665.0, "step": 16558 }, { "epoch": 2.106475003180257, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.07706069946289, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8594291806221008, "num_tokens": 631709536.0, "step": 16559 }, { "epoch": 2.1066022134588476, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.56074142456055, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8768600225448608, "num_tokens": 631744333.0, "step": 16560 }, { "epoch": 2.106729423737438, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.40155792236328, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8474652171134949, "num_tokens": 631783522.0, "step": 16561 }, { "epoch": 2.1068566340160286, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.69418716430664, "learning_rate": 1e-06, "loss": 0.6325, "mean_token_accuracy": 0.8458684682846069, "num_tokens": 631822579.0, "step": 16562 }, { "epoch": 2.106983844294619, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.59304428100586, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8708926439285278, "num_tokens": 631862642.0, "step": 16563 }, { "epoch": 2.1071110545732097, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.48091125488281, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8732960224151611, "num_tokens": 631903161.0, "step": 16564 }, { "epoch": 2.1072382648518, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.419185638427734, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8640099763870239, "num_tokens": 631938149.0, "step": 16565 }, { "epoch": 2.1073654751303907, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.680850982666016, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8643568754196167, "num_tokens": 631974717.0, "step": 16566 }, { "epoch": 2.107492685408981, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.27276611328125, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8667612075805664, "num_tokens": 632011147.0, "step": 16567 }, { "epoch": 2.1076198956875714, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.303470611572266, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8696127533912659, "num_tokens": 632051915.0, "step": 16568 }, { "epoch": 2.107747105966162, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.81930160522461, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8531556129455566, "num_tokens": 632083047.0, "step": 16569 }, { "epoch": 2.1078743162447524, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.32254409790039, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.8740761280059814, "num_tokens": 632119250.0, "step": 16570 }, { "epoch": 2.108001526523343, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.8060302734375, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8704467415809631, "num_tokens": 632152696.0, "step": 16571 }, { "epoch": 2.1081287368019335, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.208763122558594, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8843214511871338, "num_tokens": 632188167.0, "step": 16572 }, { "epoch": 2.108255947080524, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.46752166748047, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8605929613113403, "num_tokens": 632229567.0, "step": 16573 }, { "epoch": 2.1083831573591145, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.43842697143555, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8736785650253296, "num_tokens": 632266870.0, "step": 16574 }, { "epoch": 2.108510367637705, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.743812561035156, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8632063865661621, "num_tokens": 632303844.0, "step": 16575 }, { "epoch": 2.1086375779162956, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.9974250793457, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.865129828453064, "num_tokens": 632339343.0, "step": 16576 }, { "epoch": 2.108764788194886, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.3121452331543, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.867093563079834, "num_tokens": 632385516.0, "step": 16577 }, { "epoch": 2.1088919984734766, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.53262710571289, "learning_rate": 1e-06, "loss": 0.5592, "mean_token_accuracy": 0.8677338361740112, "num_tokens": 632420664.0, "step": 16578 }, { "epoch": 2.109019208752067, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.02199172973633, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8688247203826904, "num_tokens": 632458082.0, "step": 16579 }, { "epoch": 2.1091464190306577, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.39114761352539, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.8503996729850769, "num_tokens": 632495963.0, "step": 16580 }, { "epoch": 2.109273629309248, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.509002685546875, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8645036220550537, "num_tokens": 632529968.0, "step": 16581 }, { "epoch": 2.1094008395878387, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.78279495239258, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8719773292541504, "num_tokens": 632567527.0, "step": 16582 }, { "epoch": 2.1095280498664293, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.493629455566406, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8615087866783142, "num_tokens": 632606192.0, "step": 16583 }, { "epoch": 2.10965526014502, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.61307144165039, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.87477046251297, "num_tokens": 632645940.0, "step": 16584 }, { "epoch": 2.1097824704236103, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.37557601928711, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8714349269866943, "num_tokens": 632689385.0, "step": 16585 }, { "epoch": 2.109909680702201, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.79842758178711, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.8685144186019897, "num_tokens": 632720178.0, "step": 16586 }, { "epoch": 2.1100368909807914, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.50090408325195, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.855407178401947, "num_tokens": 632763249.0, "step": 16587 }, { "epoch": 2.110164101259382, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.75782012939453, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8440515995025635, "num_tokens": 632803293.0, "step": 16588 }, { "epoch": 2.1102913115379724, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.80668258666992, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8553304076194763, "num_tokens": 632838731.0, "step": 16589 }, { "epoch": 2.110418521816563, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.11951446533203, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8531976938247681, "num_tokens": 632875044.0, "step": 16590 }, { "epoch": 2.1105457320951535, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.01004409790039, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8431565761566162, "num_tokens": 632911330.0, "step": 16591 }, { "epoch": 2.1106729423737436, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.615509033203125, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8529642820358276, "num_tokens": 632952122.0, "step": 16592 }, { "epoch": 2.110800152652334, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.95032501220703, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8438187837600708, "num_tokens": 632985602.0, "step": 16593 }, { "epoch": 2.1109273629309246, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.63563537597656, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8629657626152039, "num_tokens": 633023792.0, "step": 16594 }, { "epoch": 2.111054573209515, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.23189926147461, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.863850474357605, "num_tokens": 633065457.0, "step": 16595 }, { "epoch": 2.1111817834881057, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.307376861572266, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8720657825469971, "num_tokens": 633101402.0, "step": 16596 }, { "epoch": 2.1113089937666962, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.9902229309082, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8541027307510376, "num_tokens": 633147002.0, "step": 16597 }, { "epoch": 2.1114362040452868, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.324615478515625, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.862217366695404, "num_tokens": 633184209.0, "step": 16598 }, { "epoch": 2.1115634143238773, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.26481246948242, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8680368661880493, "num_tokens": 633224525.0, "step": 16599 }, { "epoch": 2.111690624602468, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.05323791503906, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8755921721458435, "num_tokens": 633253171.0, "step": 16600 }, { "epoch": 2.1118178348810583, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.75055694580078, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8602707386016846, "num_tokens": 633294326.0, "step": 16601 }, { "epoch": 2.111945045159649, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.961936950683594, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8769158720970154, "num_tokens": 633327236.0, "step": 16602 }, { "epoch": 2.1120722554382394, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.42435073852539, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8726267218589783, "num_tokens": 633368593.0, "step": 16603 }, { "epoch": 2.11219946571683, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.52042007446289, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8432040214538574, "num_tokens": 633402598.0, "step": 16604 }, { "epoch": 2.1123266759954205, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.41493606567383, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8586636781692505, "num_tokens": 633437069.0, "step": 16605 }, { "epoch": 2.112453886274011, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.80509567260742, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8654385805130005, "num_tokens": 633479752.0, "step": 16606 }, { "epoch": 2.1125810965526015, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.07320785522461, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8600509166717529, "num_tokens": 633516949.0, "step": 16607 }, { "epoch": 2.112708306831192, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.44114685058594, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8669096827507019, "num_tokens": 633547511.0, "step": 16608 }, { "epoch": 2.1128355171097826, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.06294250488281, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8658645749092102, "num_tokens": 633583940.0, "step": 16609 }, { "epoch": 2.112962727388373, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.471534729003906, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8521971702575684, "num_tokens": 633626245.0, "step": 16610 }, { "epoch": 2.1130899376669636, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.188499450683594, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8666280508041382, "num_tokens": 633662889.0, "step": 16611 }, { "epoch": 2.113217147945554, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.70248794555664, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8597670793533325, "num_tokens": 633701461.0, "step": 16612 }, { "epoch": 2.1133443582241447, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.752708435058594, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8641607761383057, "num_tokens": 633741302.0, "step": 16613 }, { "epoch": 2.113471568502735, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.98228454589844, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.867723822593689, "num_tokens": 633780337.0, "step": 16614 }, { "epoch": 2.1135987787813257, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.45935821533203, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8703135848045349, "num_tokens": 633814450.0, "step": 16615 }, { "epoch": 2.1137259890599163, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.2003173828125, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8542360663414001, "num_tokens": 633856252.0, "step": 16616 }, { "epoch": 2.1138531993385064, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.73773956298828, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8521407842636108, "num_tokens": 633895235.0, "step": 16617 }, { "epoch": 2.113980409617097, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.01136779785156, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8661438822746277, "num_tokens": 633938948.0, "step": 16618 }, { "epoch": 2.1141076198956874, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.97825241088867, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8723129630088806, "num_tokens": 633973393.0, "step": 16619 }, { "epoch": 2.114234830174278, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.29825210571289, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8687523603439331, "num_tokens": 634006714.0, "step": 16620 }, { "epoch": 2.1143620404528685, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.78083801269531, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8685240745544434, "num_tokens": 634046104.0, "step": 16621 }, { "epoch": 2.114489250731459, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.342594146728516, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8565396070480347, "num_tokens": 634086401.0, "step": 16622 }, { "epoch": 2.1146164610100495, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.763816833496094, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8543208837509155, "num_tokens": 634125479.0, "step": 16623 }, { "epoch": 2.11474367128864, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.410316467285156, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.845212996006012, "num_tokens": 634163873.0, "step": 16624 }, { "epoch": 2.1148708815672306, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.637428283691406, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8616761565208435, "num_tokens": 634200533.0, "step": 16625 }, { "epoch": 2.114998091845821, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.28559112548828, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8555258512496948, "num_tokens": 634235891.0, "step": 16626 }, { "epoch": 2.1151253021244116, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.4896125793457, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8618137836456299, "num_tokens": 634266748.0, "step": 16627 }, { "epoch": 2.115252512403002, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.25101089477539, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8491791486740112, "num_tokens": 634302603.0, "step": 16628 }, { "epoch": 2.1153797226815927, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.793338775634766, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8642887473106384, "num_tokens": 634345471.0, "step": 16629 }, { "epoch": 2.115506932960183, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.32911682128906, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8611760139465332, "num_tokens": 634383902.0, "step": 16630 }, { "epoch": 2.1156341432387737, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.83407211303711, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.854134202003479, "num_tokens": 634421082.0, "step": 16631 }, { "epoch": 2.1157613535173643, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.916709899902344, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8583697080612183, "num_tokens": 634456490.0, "step": 16632 }, { "epoch": 2.115888563795955, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.3415412902832, "learning_rate": 1e-06, "loss": 0.6301, "mean_token_accuracy": 0.8449540734291077, "num_tokens": 634498252.0, "step": 16633 }, { "epoch": 2.1160157740745453, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.5853271484375, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8716802597045898, "num_tokens": 634540769.0, "step": 16634 }, { "epoch": 2.116142984353136, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.51313781738281, "learning_rate": 1e-06, "loss": 0.5506, "mean_token_accuracy": 0.8700141906738281, "num_tokens": 634573807.0, "step": 16635 }, { "epoch": 2.1162701946317264, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.81344985961914, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8585759401321411, "num_tokens": 634609542.0, "step": 16636 }, { "epoch": 2.116397404910317, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.157020568847656, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8696569204330444, "num_tokens": 634648641.0, "step": 16637 }, { "epoch": 2.1165246151889074, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.952754974365234, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8611130714416504, "num_tokens": 634685531.0, "step": 16638 }, { "epoch": 2.116651825467498, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.05123519897461, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8642643690109253, "num_tokens": 634717239.0, "step": 16639 }, { "epoch": 2.116779035746088, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.903053283691406, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8592438697814941, "num_tokens": 634750058.0, "step": 16640 }, { "epoch": 2.1169062460246786, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.308738708496094, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.856711745262146, "num_tokens": 634793480.0, "step": 16641 }, { "epoch": 2.117033456303269, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.521968841552734, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8532393574714661, "num_tokens": 634829923.0, "step": 16642 }, { "epoch": 2.1171606665818596, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.82660675048828, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8584167957305908, "num_tokens": 634868499.0, "step": 16643 }, { "epoch": 2.11728787686045, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.226226806640625, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8708239793777466, "num_tokens": 634905922.0, "step": 16644 }, { "epoch": 2.1174150871390407, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.73685836791992, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8640683889389038, "num_tokens": 634946190.0, "step": 16645 }, { "epoch": 2.1175422974176312, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.389339447021484, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8751019835472107, "num_tokens": 634981840.0, "step": 16646 }, { "epoch": 2.1176695076962218, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.587158203125, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8595690727233887, "num_tokens": 635024755.0, "step": 16647 }, { "epoch": 2.1177967179748123, "ewc_loss": 0.171875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.890113830566406, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8671514987945557, "num_tokens": 635063076.0, "step": 16648 }, { "epoch": 2.117923928253403, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.36231994628906, "learning_rate": 1e-06, "loss": 0.5397, "mean_token_accuracy": 0.8745648264884949, "num_tokens": 635103052.0, "step": 16649 }, { "epoch": 2.1180511385319933, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.67461395263672, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8717422485351562, "num_tokens": 635140917.0, "step": 16650 }, { "epoch": 2.118178348810584, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.29132843017578, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.868069052696228, "num_tokens": 635182049.0, "step": 16651 }, { "epoch": 2.1183055590891744, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.09246063232422, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8566010594367981, "num_tokens": 635217613.0, "step": 16652 }, { "epoch": 2.118432769367765, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.756690979003906, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8724135756492615, "num_tokens": 635252716.0, "step": 16653 }, { "epoch": 2.1185599796463555, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.1921272277832, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8659621477127075, "num_tokens": 635288919.0, "step": 16654 }, { "epoch": 2.118687189924946, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.83097839355469, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8576693534851074, "num_tokens": 635329781.0, "step": 16655 }, { "epoch": 2.1188144002035365, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.640438079833984, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8713160753250122, "num_tokens": 635363844.0, "step": 16656 }, { "epoch": 2.118941610482127, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.84676742553711, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8596395254135132, "num_tokens": 635397568.0, "step": 16657 }, { "epoch": 2.1190688207607176, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.52315139770508, "learning_rate": 1e-06, "loss": 0.6214, "mean_token_accuracy": 0.8499223589897156, "num_tokens": 635432903.0, "step": 16658 }, { "epoch": 2.119196031039308, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.58037567138672, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8642477989196777, "num_tokens": 635473388.0, "step": 16659 }, { "epoch": 2.1193232413178986, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.56769561767578, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8662123680114746, "num_tokens": 635512966.0, "step": 16660 }, { "epoch": 2.119450451596489, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.91630554199219, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8481553792953491, "num_tokens": 635547814.0, "step": 16661 }, { "epoch": 2.1195776618750797, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.132293701171875, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8515311479568481, "num_tokens": 635583120.0, "step": 16662 }, { "epoch": 2.11970487215367, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.99544906616211, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8576940298080444, "num_tokens": 635621531.0, "step": 16663 }, { "epoch": 2.1198320824322607, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.788692474365234, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.8501137495040894, "num_tokens": 635659641.0, "step": 16664 }, { "epoch": 2.119959292710851, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.33816146850586, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.857787013053894, "num_tokens": 635697953.0, "step": 16665 }, { "epoch": 2.1200865029894413, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.361289978027344, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8383193612098694, "num_tokens": 635742519.0, "step": 16666 }, { "epoch": 2.120213713268032, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.906272888183594, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8704444169998169, "num_tokens": 635777786.0, "step": 16667 }, { "epoch": 2.1203409235466224, "ewc_loss": 0.1708984375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000148773193359375, "grad_norm": 46.50545883178711, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8644176125526428, "num_tokens": 635814490.0, "step": 16668 }, { "epoch": 2.120468133825213, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.34827423095703, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8496320247650146, "num_tokens": 635858351.0, "step": 16669 }, { "epoch": 2.1205953441038035, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.97359848022461, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8575201034545898, "num_tokens": 635895585.0, "step": 16670 }, { "epoch": 2.120722554382394, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.045230865478516, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8690621852874756, "num_tokens": 635933591.0, "step": 16671 }, { "epoch": 2.1208497646609845, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.463714599609375, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8599724173545837, "num_tokens": 635972040.0, "step": 16672 }, { "epoch": 2.120976974939575, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.435264587402344, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8658881783485413, "num_tokens": 636013287.0, "step": 16673 }, { "epoch": 2.1211041852181656, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.22001647949219, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8582576513290405, "num_tokens": 636046137.0, "step": 16674 }, { "epoch": 2.121231395496756, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.775821685791016, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8521897792816162, "num_tokens": 636088870.0, "step": 16675 }, { "epoch": 2.1213586057753466, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.08100128173828, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8709084987640381, "num_tokens": 636130519.0, "step": 16676 }, { "epoch": 2.121485816053937, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.854557037353516, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8653271198272705, "num_tokens": 636165641.0, "step": 16677 }, { "epoch": 2.1216130263325277, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.26568603515625, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8476185202598572, "num_tokens": 636203811.0, "step": 16678 }, { "epoch": 2.121740236611118, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.905460357666016, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8744248747825623, "num_tokens": 636239503.0, "step": 16679 }, { "epoch": 2.1218674468897087, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.60417938232422, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8658708930015564, "num_tokens": 636278994.0, "step": 16680 }, { "epoch": 2.1219946571682993, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.43252944946289, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8589482307434082, "num_tokens": 636318025.0, "step": 16681 }, { "epoch": 2.12212186744689, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.346290588378906, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8648073077201843, "num_tokens": 636357289.0, "step": 16682 }, { "epoch": 2.1222490777254803, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.09196853637695, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.8459997773170471, "num_tokens": 636399067.0, "step": 16683 }, { "epoch": 2.122376288004071, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.94636917114258, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8674988746643066, "num_tokens": 636442418.0, "step": 16684 }, { "epoch": 2.1225034982826614, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.4554557800293, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8506489992141724, "num_tokens": 636484273.0, "step": 16685 }, { "epoch": 2.122630708561252, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.6005744934082, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8697571158409119, "num_tokens": 636525675.0, "step": 16686 }, { "epoch": 2.1227579188398424, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.38302993774414, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8609278202056885, "num_tokens": 636559652.0, "step": 16687 }, { "epoch": 2.122885129118433, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.40114212036133, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8821223974227905, "num_tokens": 636597900.0, "step": 16688 }, { "epoch": 2.1230123393970235, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.51461410522461, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8628751039505005, "num_tokens": 636635822.0, "step": 16689 }, { "epoch": 2.1231395496756136, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.481868743896484, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8510476350784302, "num_tokens": 636667275.0, "step": 16690 }, { "epoch": 2.123266759954204, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.34178161621094, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8643324375152588, "num_tokens": 636704723.0, "step": 16691 }, { "epoch": 2.1233939702327946, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.557498931884766, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8673489093780518, "num_tokens": 636748209.0, "step": 16692 }, { "epoch": 2.123521180511385, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.168495178222656, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.868572473526001, "num_tokens": 636785052.0, "step": 16693 }, { "epoch": 2.1236483907899757, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.62578582763672, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8751204013824463, "num_tokens": 636816073.0, "step": 16694 }, { "epoch": 2.123775601068566, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.754024505615234, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8683013916015625, "num_tokens": 636848963.0, "step": 16695 }, { "epoch": 2.1239028113471567, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.95706558227539, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8746943473815918, "num_tokens": 636880632.0, "step": 16696 }, { "epoch": 2.1240300216257473, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.01274108886719, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8502649664878845, "num_tokens": 636923597.0, "step": 16697 }, { "epoch": 2.124157231904338, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.02715301513672, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8560565710067749, "num_tokens": 636962250.0, "step": 16698 }, { "epoch": 2.1242844421829283, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.27482223510742, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8711408972740173, "num_tokens": 636997973.0, "step": 16699 }, { "epoch": 2.124411652461519, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.03941345214844, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8499525189399719, "num_tokens": 637037320.0, "step": 16700 }, { "epoch": 2.1245388627401094, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.78473663330078, "learning_rate": 1e-06, "loss": 0.6581, "mean_token_accuracy": 0.8390812873840332, "num_tokens": 637072267.0, "step": 16701 }, { "epoch": 2.1246660730187, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.85770034790039, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8780162334442139, "num_tokens": 637108782.0, "step": 16702 }, { "epoch": 2.1247932832972904, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.30030822753906, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8609971404075623, "num_tokens": 637147368.0, "step": 16703 }, { "epoch": 2.124920493575881, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.594032287597656, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8712188601493835, "num_tokens": 637181909.0, "step": 16704 }, { "epoch": 2.1250477038544715, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.11482238769531, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8611328601837158, "num_tokens": 637218498.0, "step": 16705 }, { "epoch": 2.125174914133062, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.912532806396484, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8636484146118164, "num_tokens": 637254825.0, "step": 16706 }, { "epoch": 2.1253021244116526, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.150203704833984, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8534455895423889, "num_tokens": 637301593.0, "step": 16707 }, { "epoch": 2.125429334690243, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.2109489440918, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8750520944595337, "num_tokens": 637341676.0, "step": 16708 }, { "epoch": 2.1255565449688336, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.16327667236328, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8680078983306885, "num_tokens": 637378840.0, "step": 16709 }, { "epoch": 2.125683755247424, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.46384811401367, "learning_rate": 1e-06, "loss": 0.6378, "mean_token_accuracy": 0.8456805944442749, "num_tokens": 637412711.0, "step": 16710 }, { "epoch": 2.1258109655260147, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.264347076416016, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8610809445381165, "num_tokens": 637448287.0, "step": 16711 }, { "epoch": 2.125938175804605, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.03749465942383, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8609722852706909, "num_tokens": 637487899.0, "step": 16712 }, { "epoch": 2.1260653860831957, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.55366134643555, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8681812286376953, "num_tokens": 637528161.0, "step": 16713 }, { "epoch": 2.1261925963617863, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.02565383911133, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8507930040359497, "num_tokens": 637565692.0, "step": 16714 }, { "epoch": 2.1263198066403763, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.146060943603516, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8501603007316589, "num_tokens": 637600368.0, "step": 16715 }, { "epoch": 2.126447016918967, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.0552978515625, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8591451644897461, "num_tokens": 637635732.0, "step": 16716 }, { "epoch": 2.1265742271975574, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.364051818847656, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8577321767807007, "num_tokens": 637677127.0, "step": 16717 }, { "epoch": 2.126701437476148, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.783714294433594, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8620953559875488, "num_tokens": 637715278.0, "step": 16718 }, { "epoch": 2.1268286477547385, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.50666046142578, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8656958341598511, "num_tokens": 637742937.0, "step": 16719 }, { "epoch": 2.126955858033329, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 46.943878173828125, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8655709028244019, "num_tokens": 637776761.0, "step": 16720 }, { "epoch": 2.1270830683119195, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.08619689941406, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8650181293487549, "num_tokens": 637812197.0, "step": 16721 }, { "epoch": 2.12721027859051, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.127403259277344, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.868424117565155, "num_tokens": 637850442.0, "step": 16722 }, { "epoch": 2.1273374888691006, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.912723541259766, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8632686138153076, "num_tokens": 637889352.0, "step": 16723 }, { "epoch": 2.127464699147691, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.95225524902344, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8432697653770447, "num_tokens": 637927195.0, "step": 16724 }, { "epoch": 2.1275919094262816, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.79423141479492, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8506031632423401, "num_tokens": 637969431.0, "step": 16725 }, { "epoch": 2.127719119704872, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.33579635620117, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8792937994003296, "num_tokens": 638013910.0, "step": 16726 }, { "epoch": 2.1278463299834627, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.31289291381836, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8509827852249146, "num_tokens": 638051848.0, "step": 16727 }, { "epoch": 2.127973540262053, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.03199005126953, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.86384117603302, "num_tokens": 638092192.0, "step": 16728 }, { "epoch": 2.1281007505406437, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.557952880859375, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8727046847343445, "num_tokens": 638134306.0, "step": 16729 }, { "epoch": 2.1282279608192343, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.80140686035156, "learning_rate": 1e-06, "loss": 0.6319, "mean_token_accuracy": 0.8480719327926636, "num_tokens": 638177988.0, "step": 16730 }, { "epoch": 2.128355171097825, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.17943572998047, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8727327585220337, "num_tokens": 638212237.0, "step": 16731 }, { "epoch": 2.1284823813764153, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.991172790527344, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8557325005531311, "num_tokens": 638252492.0, "step": 16732 }, { "epoch": 2.128609591655006, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.33540344238281, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8614657521247864, "num_tokens": 638294308.0, "step": 16733 }, { "epoch": 2.1287368019335964, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.92565155029297, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.86760413646698, "num_tokens": 638330178.0, "step": 16734 }, { "epoch": 2.128864012212187, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.783138275146484, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.842961311340332, "num_tokens": 638372731.0, "step": 16735 }, { "epoch": 2.1289912224907774, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.25101852416992, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8580002784729004, "num_tokens": 638407522.0, "step": 16736 }, { "epoch": 2.129118432769368, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.95032501220703, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8653687238693237, "num_tokens": 638443262.0, "step": 16737 }, { "epoch": 2.129245643047958, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.49396896362305, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8632562160491943, "num_tokens": 638482729.0, "step": 16738 }, { "epoch": 2.129372853326549, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.83951187133789, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.869255006313324, "num_tokens": 638520034.0, "step": 16739 }, { "epoch": 2.129500063605139, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.2985954284668, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.8702332973480225, "num_tokens": 638558631.0, "step": 16740 }, { "epoch": 2.1296272738837296, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.41423797607422, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8730356097221375, "num_tokens": 638593565.0, "step": 16741 }, { "epoch": 2.12975448416232, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.87371063232422, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8674293160438538, "num_tokens": 638625952.0, "step": 16742 }, { "epoch": 2.1298816944409107, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.92205810546875, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8603029847145081, "num_tokens": 638662000.0, "step": 16743 }, { "epoch": 2.130008904719501, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.8192024230957, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8490896821022034, "num_tokens": 638706020.0, "step": 16744 }, { "epoch": 2.1301361149980917, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.29425811767578, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8680248260498047, "num_tokens": 638745354.0, "step": 16745 }, { "epoch": 2.1302633252766823, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.89448928833008, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.856957733631134, "num_tokens": 638785497.0, "step": 16746 }, { "epoch": 2.130390535555273, "ewc_loss": 0.171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014972686767578125, "grad_norm": 46.52376937866211, "learning_rate": 1e-06, "loss": 0.6253, "mean_token_accuracy": 0.842512845993042, "num_tokens": 638821650.0, "step": 16747 }, { "epoch": 2.1305177458338633, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.043697357177734, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.849219560623169, "num_tokens": 638855589.0, "step": 16748 }, { "epoch": 2.130644956112454, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.753482818603516, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8618563413619995, "num_tokens": 638895766.0, "step": 16749 }, { "epoch": 2.1307721663910444, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.50081253051758, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8527853488922119, "num_tokens": 638932622.0, "step": 16750 }, { "epoch": 2.130899376669635, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.79703903198242, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8519266843795776, "num_tokens": 638971588.0, "step": 16751 }, { "epoch": 2.1310265869482254, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.97523498535156, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8670650720596313, "num_tokens": 639009921.0, "step": 16752 }, { "epoch": 2.131153797226816, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.978004455566406, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8623135089874268, "num_tokens": 639048160.0, "step": 16753 }, { "epoch": 2.1312810075054065, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.678932189941406, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8742672204971313, "num_tokens": 639080260.0, "step": 16754 }, { "epoch": 2.131408217783997, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.515777587890625, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8545174598693848, "num_tokens": 639118758.0, "step": 16755 }, { "epoch": 2.1315354280625876, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.62935256958008, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8620047569274902, "num_tokens": 639159528.0, "step": 16756 }, { "epoch": 2.131662638341178, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.27576446533203, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8670965433120728, "num_tokens": 639200394.0, "step": 16757 }, { "epoch": 2.1317898486197686, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.005455017089844, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8542388677597046, "num_tokens": 639240938.0, "step": 16758 }, { "epoch": 2.131917058898359, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.92958450317383, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8571739792823792, "num_tokens": 639278097.0, "step": 16759 }, { "epoch": 2.1320442691769497, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.62519836425781, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8636212348937988, "num_tokens": 639324512.0, "step": 16760 }, { "epoch": 2.13217147945554, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.824283599853516, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8669173121452332, "num_tokens": 639366833.0, "step": 16761 }, { "epoch": 2.1322986897341307, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.16922378540039, "learning_rate": 1e-06, "loss": 0.6361, "mean_token_accuracy": 0.8472973704338074, "num_tokens": 639405765.0, "step": 16762 }, { "epoch": 2.132425900012721, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.01656723022461, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8571817874908447, "num_tokens": 639439622.0, "step": 16763 }, { "epoch": 2.1325531102913113, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.31865310668945, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8733261823654175, "num_tokens": 639479943.0, "step": 16764 }, { "epoch": 2.132680320569902, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.04624938964844, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.856052577495575, "num_tokens": 639520487.0, "step": 16765 }, { "epoch": 2.1328075308484924, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.19231414794922, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8609505295753479, "num_tokens": 639558997.0, "step": 16766 }, { "epoch": 2.132934741127083, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.25529098510742, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8596360683441162, "num_tokens": 639594569.0, "step": 16767 }, { "epoch": 2.1330619514056735, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.03462219238281, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8748792409896851, "num_tokens": 639629827.0, "step": 16768 }, { "epoch": 2.133189161684264, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.50613784790039, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8549540638923645, "num_tokens": 639667634.0, "step": 16769 }, { "epoch": 2.1333163719628545, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.869789123535156, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8593412637710571, "num_tokens": 639700536.0, "step": 16770 }, { "epoch": 2.133443582241445, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.97484588623047, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8644322156906128, "num_tokens": 639744006.0, "step": 16771 }, { "epoch": 2.1335707925200356, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.57252883911133, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8680565357208252, "num_tokens": 639784950.0, "step": 16772 }, { "epoch": 2.133698002798626, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.22323226928711, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.877591073513031, "num_tokens": 639823567.0, "step": 16773 }, { "epoch": 2.1338252130772166, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.659420013427734, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.872843325138092, "num_tokens": 639863386.0, "step": 16774 }, { "epoch": 2.133952423355807, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.32335662841797, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8503113389015198, "num_tokens": 639901219.0, "step": 16775 }, { "epoch": 2.1340796336343977, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.96067810058594, "learning_rate": 1e-06, "loss": 0.6502, "mean_token_accuracy": 0.8467632532119751, "num_tokens": 639935240.0, "step": 16776 }, { "epoch": 2.134206843912988, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.57145309448242, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8542202711105347, "num_tokens": 639975370.0, "step": 16777 }, { "epoch": 2.1343340541915787, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.752662658691406, "learning_rate": 1e-06, "loss": 0.638, "mean_token_accuracy": 0.8429902791976929, "num_tokens": 640020463.0, "step": 16778 }, { "epoch": 2.1344612644701693, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.91400909423828, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8570197820663452, "num_tokens": 640057142.0, "step": 16779 }, { "epoch": 2.13458847474876, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.4684944152832, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8584462404251099, "num_tokens": 640098879.0, "step": 16780 }, { "epoch": 2.1347156850273503, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.55123519897461, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8649788498878479, "num_tokens": 640136077.0, "step": 16781 }, { "epoch": 2.134842895305941, "ewc_loss": 0.1728515625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001506805419921875, "grad_norm": 47.498130798339844, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.849073588848114, "num_tokens": 640175160.0, "step": 16782 }, { "epoch": 2.1349701055845314, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.928077697753906, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8870735168457031, "num_tokens": 640216670.0, "step": 16783 }, { "epoch": 2.135097315863122, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.35818099975586, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8580046892166138, "num_tokens": 640254872.0, "step": 16784 }, { "epoch": 2.1352245261417124, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 47.80256652832031, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8599205017089844, "num_tokens": 640289319.0, "step": 16785 }, { "epoch": 2.135351736420303, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.90153503417969, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8767644166946411, "num_tokens": 640323897.0, "step": 16786 }, { "epoch": 2.1354789466988935, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.406707763671875, "learning_rate": 1e-06, "loss": 0.554, "mean_token_accuracy": 0.8688198328018188, "num_tokens": 640366203.0, "step": 16787 }, { "epoch": 2.1356061569774836, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.12078094482422, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.872205376625061, "num_tokens": 640408089.0, "step": 16788 }, { "epoch": 2.135733367256074, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.16180419921875, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.871422290802002, "num_tokens": 640443532.0, "step": 16789 }, { "epoch": 2.1358605775346646, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.770347595214844, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.855082631111145, "num_tokens": 640480425.0, "step": 16790 }, { "epoch": 2.135987787813255, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.228057861328125, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8650223016738892, "num_tokens": 640525658.0, "step": 16791 }, { "epoch": 2.1361149980918457, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.33824157714844, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8720556497573853, "num_tokens": 640563276.0, "step": 16792 }, { "epoch": 2.136242208370436, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.31035232543945, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8542672991752625, "num_tokens": 640608150.0, "step": 16793 }, { "epoch": 2.1363694186490267, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.36785888671875, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8627197742462158, "num_tokens": 640646127.0, "step": 16794 }, { "epoch": 2.1364966289276173, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.962432861328125, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8698933720588684, "num_tokens": 640683195.0, "step": 16795 }, { "epoch": 2.136623839206208, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.24446105957031, "learning_rate": 1e-06, "loss": 0.5336, "mean_token_accuracy": 0.8771060109138489, "num_tokens": 640724102.0, "step": 16796 }, { "epoch": 2.1367510494847983, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.45461654663086, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8573333024978638, "num_tokens": 640762624.0, "step": 16797 }, { "epoch": 2.136878259763389, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.67903137207031, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8567448854446411, "num_tokens": 640799437.0, "step": 16798 }, { "epoch": 2.1370054700419794, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.661617279052734, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8675234317779541, "num_tokens": 640837115.0, "step": 16799 }, { "epoch": 2.13713268032057, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.51010513305664, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8636794090270996, "num_tokens": 640882312.0, "step": 16800 }, { "epoch": 2.1372598905991604, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.85878372192383, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8613450527191162, "num_tokens": 640921032.0, "step": 16801 }, { "epoch": 2.137387100877751, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.585731506347656, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8636013269424438, "num_tokens": 640957424.0, "step": 16802 }, { "epoch": 2.1375143111563415, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 47.06838607788086, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8542022705078125, "num_tokens": 640992985.0, "step": 16803 }, { "epoch": 2.137641521434932, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.06774139404297, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8721967935562134, "num_tokens": 641030379.0, "step": 16804 }, { "epoch": 2.1377687317135226, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.124603271484375, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8536997437477112, "num_tokens": 641068526.0, "step": 16805 }, { "epoch": 2.137895941992113, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.852577209472656, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8539593815803528, "num_tokens": 641103351.0, "step": 16806 }, { "epoch": 2.1380231522707036, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.86247634887695, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8636569976806641, "num_tokens": 641143570.0, "step": 16807 }, { "epoch": 2.138150362549294, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.24605178833008, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8665735721588135, "num_tokens": 641180625.0, "step": 16808 }, { "epoch": 2.1382775728278847, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.27640914916992, "learning_rate": 1e-06, "loss": 0.6669, "mean_token_accuracy": 0.839893639087677, "num_tokens": 641226770.0, "step": 16809 }, { "epoch": 2.138404783106475, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.64894485473633, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8740761280059814, "num_tokens": 641269682.0, "step": 16810 }, { "epoch": 2.1385319933850653, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.74204635620117, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8743780851364136, "num_tokens": 641303803.0, "step": 16811 }, { "epoch": 2.1386592036636562, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.531944274902344, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8562546968460083, "num_tokens": 641343141.0, "step": 16812 }, { "epoch": 2.1387864139422463, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.32246398925781, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8670252561569214, "num_tokens": 641384006.0, "step": 16813 }, { "epoch": 2.138913624220837, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.569114685058594, "learning_rate": 1e-06, "loss": 0.6482, "mean_token_accuracy": 0.8402435779571533, "num_tokens": 641423998.0, "step": 16814 }, { "epoch": 2.1390408344994274, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.3615837097168, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8617046475410461, "num_tokens": 641457886.0, "step": 16815 }, { "epoch": 2.139168044778018, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.88862228393555, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8629728555679321, "num_tokens": 641503386.0, "step": 16816 }, { "epoch": 2.1392952550566084, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.43915557861328, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8736392855644226, "num_tokens": 641539533.0, "step": 16817 }, { "epoch": 2.139422465335199, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.097660064697266, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8566884994506836, "num_tokens": 641574020.0, "step": 16818 }, { "epoch": 2.1395496756137895, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.079811096191406, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8676197528839111, "num_tokens": 641605206.0, "step": 16819 }, { "epoch": 2.13967688589238, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.36390686035156, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8522698283195496, "num_tokens": 641649857.0, "step": 16820 }, { "epoch": 2.1398040961709706, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.89842224121094, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8567794561386108, "num_tokens": 641683360.0, "step": 16821 }, { "epoch": 2.139931306449561, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.49475860595703, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8628257513046265, "num_tokens": 641720105.0, "step": 16822 }, { "epoch": 2.1400585167281516, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.07695007324219, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8642933964729309, "num_tokens": 641750718.0, "step": 16823 }, { "epoch": 2.140185727006742, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.31282043457031, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8595970273017883, "num_tokens": 641791620.0, "step": 16824 }, { "epoch": 2.1403129372853327, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.241214752197266, "learning_rate": 1e-06, "loss": 0.6311, "mean_token_accuracy": 0.849906325340271, "num_tokens": 641833566.0, "step": 16825 }, { "epoch": 2.140440147563923, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.845603942871094, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8694130182266235, "num_tokens": 641869080.0, "step": 16826 }, { "epoch": 2.1405673578425137, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.23350524902344, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8594136834144592, "num_tokens": 641910195.0, "step": 16827 }, { "epoch": 2.1406945681211043, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.49375915527344, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8483577966690063, "num_tokens": 641947270.0, "step": 16828 }, { "epoch": 2.140821778399695, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.309261322021484, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.863823413848877, "num_tokens": 641986472.0, "step": 16829 }, { "epoch": 2.1409489886782853, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.52494812011719, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8506025075912476, "num_tokens": 642028130.0, "step": 16830 }, { "epoch": 2.141076198956876, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.5059814453125, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.856478750705719, "num_tokens": 642067097.0, "step": 16831 }, { "epoch": 2.1412034092354664, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.610557556152344, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8736056089401245, "num_tokens": 642099628.0, "step": 16832 }, { "epoch": 2.141330619514057, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.120025634765625, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8849920034408569, "num_tokens": 642135941.0, "step": 16833 }, { "epoch": 2.1414578297926474, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.186580657958984, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8545932769775391, "num_tokens": 642169465.0, "step": 16834 }, { "epoch": 2.141585040071238, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.55500411987305, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8640127182006836, "num_tokens": 642208961.0, "step": 16835 }, { "epoch": 2.141712250349828, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.71597671508789, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.860456645488739, "num_tokens": 642249691.0, "step": 16836 }, { "epoch": 2.141839460628419, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.63228225708008, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8637599349021912, "num_tokens": 642292425.0, "step": 16837 }, { "epoch": 2.141966670907009, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.71243667602539, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8726857900619507, "num_tokens": 642330982.0, "step": 16838 }, { "epoch": 2.1420938811855996, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.04511642456055, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8625993728637695, "num_tokens": 642365414.0, "step": 16839 }, { "epoch": 2.14222109146419, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.82774353027344, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8569109439849854, "num_tokens": 642398660.0, "step": 16840 }, { "epoch": 2.1423483017427807, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.59260940551758, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8682419061660767, "num_tokens": 642432605.0, "step": 16841 }, { "epoch": 2.142475512021371, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.13996124267578, "learning_rate": 1e-06, "loss": 0.5587, "mean_token_accuracy": 0.8720117807388306, "num_tokens": 642472041.0, "step": 16842 }, { "epoch": 2.1426027222999617, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.6781005859375, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8703290224075317, "num_tokens": 642512399.0, "step": 16843 }, { "epoch": 2.1427299325785523, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.912654876708984, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8553239107131958, "num_tokens": 642550829.0, "step": 16844 }, { "epoch": 2.142857142857143, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.082374572753906, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8621910810470581, "num_tokens": 642591255.0, "step": 16845 }, { "epoch": 2.1429843531357333, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.793678283691406, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8647273778915405, "num_tokens": 642625772.0, "step": 16846 }, { "epoch": 2.143111563414324, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.27208709716797, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.854527473449707, "num_tokens": 642667596.0, "step": 16847 }, { "epoch": 2.1432387736929144, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.125389099121094, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8630176782608032, "num_tokens": 642702159.0, "step": 16848 }, { "epoch": 2.143365983971505, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.027183532714844, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8765724897384644, "num_tokens": 642737307.0, "step": 16849 }, { "epoch": 2.1434931942500954, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.33470916748047, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8523865342140198, "num_tokens": 642776660.0, "step": 16850 }, { "epoch": 2.143620404528686, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.990692138671875, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8555881977081299, "num_tokens": 642814131.0, "step": 16851 }, { "epoch": 2.1437476148072765, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.35348892211914, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8432883620262146, "num_tokens": 642854138.0, "step": 16852 }, { "epoch": 2.143874825085867, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.88575744628906, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8688919544219971, "num_tokens": 642893767.0, "step": 16853 }, { "epoch": 2.1440020353644575, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.451507568359375, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8781884908676147, "num_tokens": 642928286.0, "step": 16854 }, { "epoch": 2.144129245643048, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.30648422241211, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8701231479644775, "num_tokens": 642971220.0, "step": 16855 }, { "epoch": 2.1442564559216386, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.134437561035156, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8662173748016357, "num_tokens": 643013618.0, "step": 16856 }, { "epoch": 2.144383666200229, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.227760314941406, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8652104139328003, "num_tokens": 643058122.0, "step": 16857 }, { "epoch": 2.1445108764788197, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.829345703125, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8494173288345337, "num_tokens": 643096662.0, "step": 16858 }, { "epoch": 2.14463808675741, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.1722297668457, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8762266635894775, "num_tokens": 643137923.0, "step": 16859 }, { "epoch": 2.1447652970360007, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.95283889770508, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.868588924407959, "num_tokens": 643176918.0, "step": 16860 }, { "epoch": 2.144892507314591, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.30232238769531, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8805575370788574, "num_tokens": 643217525.0, "step": 16861 }, { "epoch": 2.1450197175931813, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.8509635925293, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8805795907974243, "num_tokens": 643255131.0, "step": 16862 }, { "epoch": 2.145146927871772, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.7645149230957, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8714554309844971, "num_tokens": 643292092.0, "step": 16863 }, { "epoch": 2.1452741381503624, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.56329345703125, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8589702844619751, "num_tokens": 643330510.0, "step": 16864 }, { "epoch": 2.145401348428953, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.21369171142578, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8579383492469788, "num_tokens": 643372900.0, "step": 16865 }, { "epoch": 2.1455285587075434, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.02664566040039, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8702685832977295, "num_tokens": 643412866.0, "step": 16866 }, { "epoch": 2.145655768986134, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.24201202392578, "learning_rate": 1e-06, "loss": 0.6272, "mean_token_accuracy": 0.8461522459983826, "num_tokens": 643452846.0, "step": 16867 }, { "epoch": 2.1457829792647245, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.93792724609375, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8695454597473145, "num_tokens": 643488264.0, "step": 16868 }, { "epoch": 2.145910189543315, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.401119232177734, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8504048585891724, "num_tokens": 643528131.0, "step": 16869 }, { "epoch": 2.1460373998219056, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.84965133666992, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8629743456840515, "num_tokens": 643564041.0, "step": 16870 }, { "epoch": 2.146164610100496, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.366390228271484, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8507025837898254, "num_tokens": 643601507.0, "step": 16871 }, { "epoch": 2.1462918203790866, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.30979537963867, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8478464484214783, "num_tokens": 643647161.0, "step": 16872 }, { "epoch": 2.146419030657677, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.29018783569336, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8686439990997314, "num_tokens": 643687114.0, "step": 16873 }, { "epoch": 2.1465462409362677, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.894962310791016, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8483477830886841, "num_tokens": 643722110.0, "step": 16874 }, { "epoch": 2.146673451214858, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.55047607421875, "learning_rate": 1e-06, "loss": 0.6663, "mean_token_accuracy": 0.8403133153915405, "num_tokens": 643765770.0, "step": 16875 }, { "epoch": 2.1468006614934487, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.74927520751953, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8534166812896729, "num_tokens": 643810097.0, "step": 16876 }, { "epoch": 2.1469278717720393, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.994625091552734, "learning_rate": 1e-06, "loss": 0.6648, "mean_token_accuracy": 0.8364774584770203, "num_tokens": 643847904.0, "step": 16877 }, { "epoch": 2.14705508205063, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.77781677246094, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8703811764717102, "num_tokens": 643889157.0, "step": 16878 }, { "epoch": 2.1471822923292203, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.6594123840332, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8656298518180847, "num_tokens": 643931181.0, "step": 16879 }, { "epoch": 2.147309502607811, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.03084945678711, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8620203733444214, "num_tokens": 643965415.0, "step": 16880 }, { "epoch": 2.1474367128864014, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.250125885009766, "learning_rate": 1e-06, "loss": 0.6424, "mean_token_accuracy": 0.8479598760604858, "num_tokens": 644010647.0, "step": 16881 }, { "epoch": 2.147563923164992, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.930301666259766, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8799310922622681, "num_tokens": 644044347.0, "step": 16882 }, { "epoch": 2.1476911334435824, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.54179382324219, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8704319000244141, "num_tokens": 644083456.0, "step": 16883 }, { "epoch": 2.147818343722173, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.39484405517578, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8540161848068237, "num_tokens": 644123851.0, "step": 16884 }, { "epoch": 2.1479455540007635, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.12518310546875, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8730054497718811, "num_tokens": 644168880.0, "step": 16885 }, { "epoch": 2.1480727642793536, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.88534164428711, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8644686937332153, "num_tokens": 644204532.0, "step": 16886 }, { "epoch": 2.148199974557944, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.94168472290039, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.881374180316925, "num_tokens": 644246160.0, "step": 16887 }, { "epoch": 2.1483271848365346, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.68598556518555, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8764106035232544, "num_tokens": 644284881.0, "step": 16888 }, { "epoch": 2.148454395115125, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.403499603271484, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8559080362319946, "num_tokens": 644326661.0, "step": 16889 }, { "epoch": 2.1485816053937157, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.262229919433594, "learning_rate": 1e-06, "loss": 0.5656, "mean_token_accuracy": 0.8672536611557007, "num_tokens": 644364420.0, "step": 16890 }, { "epoch": 2.148708815672306, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.810279846191406, "learning_rate": 1e-06, "loss": 0.6435, "mean_token_accuracy": 0.8447001576423645, "num_tokens": 644398626.0, "step": 16891 }, { "epoch": 2.1488360259508967, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.83570098876953, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.861735463142395, "num_tokens": 644434772.0, "step": 16892 }, { "epoch": 2.1489632362294873, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.69746780395508, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8597115278244019, "num_tokens": 644474425.0, "step": 16893 }, { "epoch": 2.149090446508078, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.954132080078125, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8546557426452637, "num_tokens": 644507796.0, "step": 16894 }, { "epoch": 2.1492176567866683, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.59608840942383, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8582144379615784, "num_tokens": 644544172.0, "step": 16895 }, { "epoch": 2.149344867065259, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.70751953125, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8608176708221436, "num_tokens": 644582217.0, "step": 16896 }, { "epoch": 2.1494720773438494, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.353641510009766, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8697065114974976, "num_tokens": 644620028.0, "step": 16897 }, { "epoch": 2.14959928762244, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.87712097167969, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8457090854644775, "num_tokens": 644663743.0, "step": 16898 }, { "epoch": 2.1497264979010304, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 46.962955474853516, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8650436401367188, "num_tokens": 644700914.0, "step": 16899 }, { "epoch": 2.149853708179621, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.38196563720703, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8592044115066528, "num_tokens": 644744232.0, "step": 16900 }, { "epoch": 2.1499809184582115, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.90024948120117, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8552383780479431, "num_tokens": 644780851.0, "step": 16901 }, { "epoch": 2.150108128736802, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.38117599487305, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8703247904777527, "num_tokens": 644822103.0, "step": 16902 }, { "epoch": 2.1502353390153925, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.715484619140625, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8595705628395081, "num_tokens": 644861209.0, "step": 16903 }, { "epoch": 2.150362549293983, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.49513626098633, "learning_rate": 1e-06, "loss": 0.6414, "mean_token_accuracy": 0.8483731746673584, "num_tokens": 644899860.0, "step": 16904 }, { "epoch": 2.1504897595725736, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.32389831542969, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8571866750717163, "num_tokens": 644943101.0, "step": 16905 }, { "epoch": 2.150616969851164, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.92897033691406, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8493227958679199, "num_tokens": 644981786.0, "step": 16906 }, { "epoch": 2.1507441801297547, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.18639373779297, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8634567856788635, "num_tokens": 645017949.0, "step": 16907 }, { "epoch": 2.150871390408345, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.855525970458984, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8607251048088074, "num_tokens": 645053443.0, "step": 16908 }, { "epoch": 2.1509986006869353, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.20962905883789, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8635126352310181, "num_tokens": 645089916.0, "step": 16909 }, { "epoch": 2.1511258109655262, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.69218444824219, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8455474376678467, "num_tokens": 645134531.0, "step": 16910 }, { "epoch": 2.1512530212441163, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.30717086791992, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8560234308242798, "num_tokens": 645172608.0, "step": 16911 }, { "epoch": 2.151380231522707, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.056827545166016, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8572280406951904, "num_tokens": 645209895.0, "step": 16912 }, { "epoch": 2.1515074418012974, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.085941314697266, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.867141604423523, "num_tokens": 645245259.0, "step": 16913 }, { "epoch": 2.151634652079888, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.202903747558594, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8572973012924194, "num_tokens": 645284962.0, "step": 16914 }, { "epoch": 2.1517618623584784, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.1691780090332, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8757545948028564, "num_tokens": 645322352.0, "step": 16915 }, { "epoch": 2.151889072637069, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.84822463989258, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8616988658905029, "num_tokens": 645358401.0, "step": 16916 }, { "epoch": 2.1520162829156595, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.691856384277344, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.874413251876831, "num_tokens": 645399147.0, "step": 16917 }, { "epoch": 2.15214349319425, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.76866912841797, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8424903154373169, "num_tokens": 645437286.0, "step": 16918 }, { "epoch": 2.1522707034728406, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.91888427734375, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8552911281585693, "num_tokens": 645477853.0, "step": 16919 }, { "epoch": 2.152397913751431, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.99941635131836, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8513534069061279, "num_tokens": 645520476.0, "step": 16920 }, { "epoch": 2.1525251240300216, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.26338577270508, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8475888967514038, "num_tokens": 645561539.0, "step": 16921 }, { "epoch": 2.152652334308612, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.75126647949219, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8562377691268921, "num_tokens": 645602950.0, "step": 16922 }, { "epoch": 2.1527795445872027, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.940765380859375, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8536792993545532, "num_tokens": 645639215.0, "step": 16923 }, { "epoch": 2.152906754865793, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.7821044921875, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8595268726348877, "num_tokens": 645677287.0, "step": 16924 }, { "epoch": 2.1530339651443837, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.774784088134766, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8701609373092651, "num_tokens": 645710784.0, "step": 16925 }, { "epoch": 2.1531611754229742, "ewc_loss": 0.1748046875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000152587890625, "grad_norm": 46.62875747680664, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8575129508972168, "num_tokens": 645750568.0, "step": 16926 }, { "epoch": 2.1532883857015648, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.37729263305664, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8664777874946594, "num_tokens": 645783911.0, "step": 16927 }, { "epoch": 2.1534155959801553, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.422794342041016, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8784730434417725, "num_tokens": 645822524.0, "step": 16928 }, { "epoch": 2.153542806258746, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 49.034400939941406, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8561979532241821, "num_tokens": 645856924.0, "step": 16929 }, { "epoch": 2.1536700165373364, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.27739715576172, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8552227020263672, "num_tokens": 645896995.0, "step": 16930 }, { "epoch": 2.153797226815927, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.29232406616211, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8672951459884644, "num_tokens": 645941768.0, "step": 16931 }, { "epoch": 2.1539244370945174, "ewc_loss": 0.173828125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015163421630859375, "grad_norm": 46.84164810180664, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8593548536300659, "num_tokens": 645981909.0, "step": 16932 }, { "epoch": 2.154051647373108, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.39366912841797, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8515965938568115, "num_tokens": 646022034.0, "step": 16933 }, { "epoch": 2.154178857651698, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.593807220458984, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8588283658027649, "num_tokens": 646056714.0, "step": 16934 }, { "epoch": 2.1543060679302886, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.581871032714844, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8529735207557678, "num_tokens": 646095668.0, "step": 16935 }, { "epoch": 2.154433278208879, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.670780181884766, "learning_rate": 1e-06, "loss": 0.6489, "mean_token_accuracy": 0.845705509185791, "num_tokens": 646136745.0, "step": 16936 }, { "epoch": 2.1545604884874696, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.62758255004883, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8585509061813354, "num_tokens": 646180314.0, "step": 16937 }, { "epoch": 2.15468769876606, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.62977600097656, "learning_rate": 1e-06, "loss": 0.5765, "mean_token_accuracy": 0.863179087638855, "num_tokens": 646218191.0, "step": 16938 }, { "epoch": 2.1548149090446507, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.234466552734375, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8767960071563721, "num_tokens": 646256350.0, "step": 16939 }, { "epoch": 2.154942119323241, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.226470947265625, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8835223317146301, "num_tokens": 646290915.0, "step": 16940 }, { "epoch": 2.1550693296018317, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.32557678222656, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8549294471740723, "num_tokens": 646325838.0, "step": 16941 }, { "epoch": 2.1551965398804223, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.98139572143555, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.8412977457046509, "num_tokens": 646362742.0, "step": 16942 }, { "epoch": 2.155323750159013, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.593379974365234, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8472157716751099, "num_tokens": 646403958.0, "step": 16943 }, { "epoch": 2.1554509604376033, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.14387893676758, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8543341159820557, "num_tokens": 646442872.0, "step": 16944 }, { "epoch": 2.155578170716194, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.61411666870117, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8674091100692749, "num_tokens": 646484642.0, "step": 16945 }, { "epoch": 2.1557053809947844, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.609657287597656, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8701184391975403, "num_tokens": 646521688.0, "step": 16946 }, { "epoch": 2.155832591273375, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.64271926879883, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.869176983833313, "num_tokens": 646560182.0, "step": 16947 }, { "epoch": 2.1559598015519654, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.41353988647461, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8651713132858276, "num_tokens": 646593252.0, "step": 16948 }, { "epoch": 2.156087011830556, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.27931213378906, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8601936101913452, "num_tokens": 646635514.0, "step": 16949 }, { "epoch": 2.1562142221091465, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.581363677978516, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.84868323802948, "num_tokens": 646674785.0, "step": 16950 }, { "epoch": 2.156341432387737, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.373966217041016, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8615885972976685, "num_tokens": 646713832.0, "step": 16951 }, { "epoch": 2.1564686426663275, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.53451156616211, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8643720149993896, "num_tokens": 646752218.0, "step": 16952 }, { "epoch": 2.156595852944918, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.04057312011719, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8603123426437378, "num_tokens": 646786196.0, "step": 16953 }, { "epoch": 2.1567230632235086, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.7160758972168, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8534159064292908, "num_tokens": 646821462.0, "step": 16954 }, { "epoch": 2.156850273502099, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.35396194458008, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8695663213729858, "num_tokens": 646856613.0, "step": 16955 }, { "epoch": 2.1569774837806897, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.763092041015625, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8733232617378235, "num_tokens": 646893013.0, "step": 16956 }, { "epoch": 2.15710469405928, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.26345443725586, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8664588928222656, "num_tokens": 646936180.0, "step": 16957 }, { "epoch": 2.1572319043378707, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.41064453125, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.861257016658783, "num_tokens": 646979314.0, "step": 16958 }, { "epoch": 2.157359114616461, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.01597213745117, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8724302053451538, "num_tokens": 647020136.0, "step": 16959 }, { "epoch": 2.1574863248950513, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.447052001953125, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8661949038505554, "num_tokens": 647052970.0, "step": 16960 }, { "epoch": 2.157613535173642, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.365142822265625, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8557264804840088, "num_tokens": 647093195.0, "step": 16961 }, { "epoch": 2.1577407454522324, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.229793548583984, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8621186017990112, "num_tokens": 647133800.0, "step": 16962 }, { "epoch": 2.157867955730823, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.58627700805664, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.845974862575531, "num_tokens": 647167704.0, "step": 16963 }, { "epoch": 2.1579951660094134, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.48400115966797, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8482143878936768, "num_tokens": 647208474.0, "step": 16964 }, { "epoch": 2.158122376288004, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.72969436645508, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8748480677604675, "num_tokens": 647242131.0, "step": 16965 }, { "epoch": 2.1582495865665945, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.80888748168945, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.8416557312011719, "num_tokens": 647280952.0, "step": 16966 }, { "epoch": 2.158376796845185, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.94998550415039, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8572248220443726, "num_tokens": 647318411.0, "step": 16967 }, { "epoch": 2.1585040071237755, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.19807052612305, "learning_rate": 1e-06, "loss": 0.6498, "mean_token_accuracy": 0.8477894067764282, "num_tokens": 647361002.0, "step": 16968 }, { "epoch": 2.158631217402366, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.969783782958984, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8640097379684448, "num_tokens": 647402995.0, "step": 16969 }, { "epoch": 2.1587584276809566, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.9389533996582, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8635241985321045, "num_tokens": 647441220.0, "step": 16970 }, { "epoch": 2.158885637959547, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.48497009277344, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8537282347679138, "num_tokens": 647476832.0, "step": 16971 }, { "epoch": 2.1590128482381377, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.86619186401367, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8642823100090027, "num_tokens": 647516855.0, "step": 16972 }, { "epoch": 2.159140058516728, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.63540267944336, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8582178354263306, "num_tokens": 647556913.0, "step": 16973 }, { "epoch": 2.1592672687953187, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.07726287841797, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8782517910003662, "num_tokens": 647591242.0, "step": 16974 }, { "epoch": 2.1593944790739092, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.91847610473633, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8654358386993408, "num_tokens": 647632156.0, "step": 16975 }, { "epoch": 2.1595216893524998, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.11420822143555, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8676510453224182, "num_tokens": 647669857.0, "step": 16976 }, { "epoch": 2.1596488996310903, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.836795806884766, "learning_rate": 1e-06, "loss": 0.6296, "mean_token_accuracy": 0.8497426509857178, "num_tokens": 647715580.0, "step": 16977 }, { "epoch": 2.159776109909681, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.23693084716797, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8530048131942749, "num_tokens": 647755917.0, "step": 16978 }, { "epoch": 2.1599033201882714, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.971656799316406, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8766184449195862, "num_tokens": 647802205.0, "step": 16979 }, { "epoch": 2.160030530466862, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.881778717041016, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8481055498123169, "num_tokens": 647846627.0, "step": 16980 }, { "epoch": 2.1601577407454524, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.89408874511719, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8533587455749512, "num_tokens": 647885033.0, "step": 16981 }, { "epoch": 2.160284951024043, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.7213020324707, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8518292903900146, "num_tokens": 647923370.0, "step": 16982 }, { "epoch": 2.1604121613026335, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.929290771484375, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8677067756652832, "num_tokens": 647963431.0, "step": 16983 }, { "epoch": 2.1605393715812236, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.9036750793457, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8530091047286987, "num_tokens": 648002854.0, "step": 16984 }, { "epoch": 2.160666581859814, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.2806282043457, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8469855785369873, "num_tokens": 648041082.0, "step": 16985 }, { "epoch": 2.1607937921384046, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.8072509765625, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8515751361846924, "num_tokens": 648082109.0, "step": 16986 }, { "epoch": 2.160921002416995, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.09662628173828, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.875438392162323, "num_tokens": 648117398.0, "step": 16987 }, { "epoch": 2.1610482126955857, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.70867156982422, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8696439266204834, "num_tokens": 648158997.0, "step": 16988 }, { "epoch": 2.161175422974176, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.078372955322266, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8594042062759399, "num_tokens": 648194334.0, "step": 16989 }, { "epoch": 2.1613026332527667, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.19263458251953, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8571178913116455, "num_tokens": 648230601.0, "step": 16990 }, { "epoch": 2.1614298435313573, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.22825622558594, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8564333319664001, "num_tokens": 648274979.0, "step": 16991 }, { "epoch": 2.161557053809948, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.9565315246582, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8455832600593567, "num_tokens": 648319229.0, "step": 16992 }, { "epoch": 2.1616842640885383, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.372501373291016, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8750336766242981, "num_tokens": 648354550.0, "step": 16993 }, { "epoch": 2.161811474367129, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.523162841796875, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.8732222318649292, "num_tokens": 648389820.0, "step": 16994 }, { "epoch": 2.1619386846457194, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.93968963623047, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8764342665672302, "num_tokens": 648423841.0, "step": 16995 }, { "epoch": 2.16206589492431, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.2730712890625, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8499144315719604, "num_tokens": 648459865.0, "step": 16996 }, { "epoch": 2.1621931052029004, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.33906173706055, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8808066844940186, "num_tokens": 648492639.0, "step": 16997 }, { "epoch": 2.162320315481491, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.122222900390625, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8672349452972412, "num_tokens": 648533456.0, "step": 16998 }, { "epoch": 2.1624475257600815, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.2353515625, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8756194710731506, "num_tokens": 648571955.0, "step": 16999 }, { "epoch": 2.162574736038672, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.58769607543945, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8769888877868652, "num_tokens": 648614284.0, "step": 17000 }, { "epoch": 2.1627019463172625, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.16484832763672, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8708845376968384, "num_tokens": 648654399.0, "step": 17001 }, { "epoch": 2.162829156595853, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.15642547607422, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8668748140335083, "num_tokens": 648694490.0, "step": 17002 }, { "epoch": 2.1629563668744436, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.13561248779297, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8666273951530457, "num_tokens": 648730665.0, "step": 17003 }, { "epoch": 2.163083577153034, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.48085021972656, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8653386831283569, "num_tokens": 648765289.0, "step": 17004 }, { "epoch": 2.1632107874316246, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.930545806884766, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8618689775466919, "num_tokens": 648803982.0, "step": 17005 }, { "epoch": 2.163337997710215, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.625946044921875, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8674420118331909, "num_tokens": 648840992.0, "step": 17006 }, { "epoch": 2.1634652079888053, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.43852233886719, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8586657047271729, "num_tokens": 648883621.0, "step": 17007 }, { "epoch": 2.1635924182673962, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.674747467041016, "learning_rate": 1e-06, "loss": 0.6287, "mean_token_accuracy": 0.848171591758728, "num_tokens": 648922258.0, "step": 17008 }, { "epoch": 2.1637196285459863, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.33201599121094, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8624109029769897, "num_tokens": 648965010.0, "step": 17009 }, { "epoch": 2.163846838824577, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.8107795715332, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8567357063293457, "num_tokens": 649008124.0, "step": 17010 }, { "epoch": 2.1639740491031674, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.50551223754883, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8619664311408997, "num_tokens": 649044743.0, "step": 17011 }, { "epoch": 2.164101259381758, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.603275299072266, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8752801418304443, "num_tokens": 649078410.0, "step": 17012 }, { "epoch": 2.1642284696603484, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.493160247802734, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8720191121101379, "num_tokens": 649115415.0, "step": 17013 }, { "epoch": 2.164355679938939, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.54306411743164, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8594039678573608, "num_tokens": 649156266.0, "step": 17014 }, { "epoch": 2.1644828902175295, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.755367279052734, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8675312995910645, "num_tokens": 649192552.0, "step": 17015 }, { "epoch": 2.16461010049612, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.35896682739258, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8641855716705322, "num_tokens": 649224163.0, "step": 17016 }, { "epoch": 2.1647373107747105, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.81669616699219, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8830384612083435, "num_tokens": 649259348.0, "step": 17017 }, { "epoch": 2.164864521053301, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.67125701904297, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8777602314949036, "num_tokens": 649296875.0, "step": 17018 }, { "epoch": 2.1649917313318916, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.26864242553711, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8481926918029785, "num_tokens": 649333455.0, "step": 17019 }, { "epoch": 2.165118941610482, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.69200897216797, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8627299070358276, "num_tokens": 649364567.0, "step": 17020 }, { "epoch": 2.1652461518890727, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.3882942199707, "learning_rate": 1e-06, "loss": 0.6557, "mean_token_accuracy": 0.8439496755599976, "num_tokens": 649400579.0, "step": 17021 }, { "epoch": 2.165373362167663, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.030033111572266, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8740108013153076, "num_tokens": 649432940.0, "step": 17022 }, { "epoch": 2.1655005724462537, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.25421142578125, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8671693801879883, "num_tokens": 649466695.0, "step": 17023 }, { "epoch": 2.1656277827248442, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.09937286376953, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.859320878982544, "num_tokens": 649504578.0, "step": 17024 }, { "epoch": 2.1657549930034348, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.86787796020508, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8515726327896118, "num_tokens": 649541536.0, "step": 17025 }, { "epoch": 2.1658822032820253, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.27463150024414, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8618813753128052, "num_tokens": 649581280.0, "step": 17026 }, { "epoch": 2.166009413560616, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.56863784790039, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8840354681015015, "num_tokens": 649619837.0, "step": 17027 }, { "epoch": 2.1661366238392064, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.25281524658203, "learning_rate": 1e-06, "loss": 0.5463, "mean_token_accuracy": 0.8739140629768372, "num_tokens": 649654708.0, "step": 17028 }, { "epoch": 2.166263834117797, "ewc_loss": 0.17578125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 46.8528938293457, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.840583086013794, "num_tokens": 649691709.0, "step": 17029 }, { "epoch": 2.1663910443963874, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.796756744384766, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8580556511878967, "num_tokens": 649734876.0, "step": 17030 }, { "epoch": 2.166518254674978, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.73579406738281, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.870893120765686, "num_tokens": 649775477.0, "step": 17031 }, { "epoch": 2.166645464953568, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.973567962646484, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8736815452575684, "num_tokens": 649816136.0, "step": 17032 }, { "epoch": 2.1667726752321586, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.1281852722168, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8711814880371094, "num_tokens": 649847942.0, "step": 17033 }, { "epoch": 2.166899885510749, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.716827392578125, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8586703538894653, "num_tokens": 649887875.0, "step": 17034 }, { "epoch": 2.1670270957893396, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.573028564453125, "learning_rate": 1e-06, "loss": 0.6336, "mean_token_accuracy": 0.8497129678726196, "num_tokens": 649920557.0, "step": 17035 }, { "epoch": 2.16715430606793, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.0459098815918, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8708893060684204, "num_tokens": 649959704.0, "step": 17036 }, { "epoch": 2.1672815163465207, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.810890197753906, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8668531775474548, "num_tokens": 650000433.0, "step": 17037 }, { "epoch": 2.167408726625111, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.34947204589844, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8756444454193115, "num_tokens": 650038793.0, "step": 17038 }, { "epoch": 2.1675359369037017, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.72184753417969, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.872880220413208, "num_tokens": 650072877.0, "step": 17039 }, { "epoch": 2.1676631471822922, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.481361389160156, "learning_rate": 1e-06, "loss": 0.6239, "mean_token_accuracy": 0.8500222563743591, "num_tokens": 650109245.0, "step": 17040 }, { "epoch": 2.1677903574608828, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.14815902709961, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.875194251537323, "num_tokens": 650150254.0, "step": 17041 }, { "epoch": 2.1679175677394733, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.15212631225586, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8415740728378296, "num_tokens": 650189587.0, "step": 17042 }, { "epoch": 2.168044778018064, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.15945053100586, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8590230345726013, "num_tokens": 650225443.0, "step": 17043 }, { "epoch": 2.1681719882966544, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.94406509399414, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8589303493499756, "num_tokens": 650262319.0, "step": 17044 }, { "epoch": 2.168299198575245, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.804080963134766, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8444420099258423, "num_tokens": 650297137.0, "step": 17045 }, { "epoch": 2.1684264088538354, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.28959655761719, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8590765595436096, "num_tokens": 650335521.0, "step": 17046 }, { "epoch": 2.168553619132426, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.54666519165039, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8564545512199402, "num_tokens": 650373050.0, "step": 17047 }, { "epoch": 2.1686808294110165, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.333011627197266, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8679388761520386, "num_tokens": 650407438.0, "step": 17048 }, { "epoch": 2.168808039689607, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.468353271484375, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8578758239746094, "num_tokens": 650447793.0, "step": 17049 }, { "epoch": 2.1689352499681975, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.95359420776367, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8583501577377319, "num_tokens": 650487167.0, "step": 17050 }, { "epoch": 2.169062460246788, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.76025390625, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.848690390586853, "num_tokens": 650522960.0, "step": 17051 }, { "epoch": 2.1691896705253786, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.11824417114258, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8557223081588745, "num_tokens": 650564161.0, "step": 17052 }, { "epoch": 2.169316880803969, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.47446823120117, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.867896318435669, "num_tokens": 650599201.0, "step": 17053 }, { "epoch": 2.1694440910825596, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.109825134277344, "learning_rate": 1e-06, "loss": 0.6386, "mean_token_accuracy": 0.8438234925270081, "num_tokens": 650637385.0, "step": 17054 }, { "epoch": 2.16957130136115, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.71950149536133, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8630410432815552, "num_tokens": 650671954.0, "step": 17055 }, { "epoch": 2.1696985116397407, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.29152297973633, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8793664574623108, "num_tokens": 650708475.0, "step": 17056 }, { "epoch": 2.169825721918331, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.57482147216797, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8662876486778259, "num_tokens": 650748089.0, "step": 17057 }, { "epoch": 2.1699529321969213, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.58101272583008, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8634589910507202, "num_tokens": 650790869.0, "step": 17058 }, { "epoch": 2.170080142475512, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.27273178100586, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8483548164367676, "num_tokens": 650830697.0, "step": 17059 }, { "epoch": 2.1702073527541024, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.4149169921875, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8710744380950928, "num_tokens": 650868755.0, "step": 17060 }, { "epoch": 2.170334563032693, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.335662841796875, "learning_rate": 1e-06, "loss": 0.5534, "mean_token_accuracy": 0.872465968132019, "num_tokens": 650908019.0, "step": 17061 }, { "epoch": 2.1704617733112834, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.74973678588867, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8609869480133057, "num_tokens": 650954195.0, "step": 17062 }, { "epoch": 2.170588983589874, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.0002326965332, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8769237995147705, "num_tokens": 650992740.0, "step": 17063 }, { "epoch": 2.1707161938684645, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.81373977661133, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8762649297714233, "num_tokens": 651031524.0, "step": 17064 }, { "epoch": 2.170843404147055, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.85587692260742, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8609684705734253, "num_tokens": 651070004.0, "step": 17065 }, { "epoch": 2.1709706144256455, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.93682861328125, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8479869365692139, "num_tokens": 651104474.0, "step": 17066 }, { "epoch": 2.171097824704236, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.98268127441406, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.854033350944519, "num_tokens": 651148913.0, "step": 17067 }, { "epoch": 2.1712250349828266, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.83396911621094, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8710615634918213, "num_tokens": 651192072.0, "step": 17068 }, { "epoch": 2.171352245261417, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.509342193603516, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.854423463344574, "num_tokens": 651226132.0, "step": 17069 }, { "epoch": 2.1714794555400077, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.18400573730469, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8633307218551636, "num_tokens": 651263467.0, "step": 17070 }, { "epoch": 2.171606665818598, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.89383316040039, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8524729013442993, "num_tokens": 651309521.0, "step": 17071 }, { "epoch": 2.1717338760971887, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.23954772949219, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8636307716369629, "num_tokens": 651347026.0, "step": 17072 }, { "epoch": 2.1718610863757792, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.43193435668945, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8710424900054932, "num_tokens": 651380545.0, "step": 17073 }, { "epoch": 2.1719882966543698, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.67721176147461, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8674986362457275, "num_tokens": 651409428.0, "step": 17074 }, { "epoch": 2.1721155069329603, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.31542205810547, "learning_rate": 1e-06, "loss": 0.6287, "mean_token_accuracy": 0.8504562377929688, "num_tokens": 651447993.0, "step": 17075 }, { "epoch": 2.172242717211551, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.465667724609375, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8565422296524048, "num_tokens": 651488210.0, "step": 17076 }, { "epoch": 2.1723699274901414, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.166378021240234, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8569120168685913, "num_tokens": 651525742.0, "step": 17077 }, { "epoch": 2.172497137768732, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.45314407348633, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.868022084236145, "num_tokens": 651565569.0, "step": 17078 }, { "epoch": 2.1726243480473224, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.83357620239258, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8558832406997681, "num_tokens": 651605291.0, "step": 17079 }, { "epoch": 2.172751558325913, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 48.15185546875, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8622389435768127, "num_tokens": 651641998.0, "step": 17080 }, { "epoch": 2.1728787686045035, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.11467742919922, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8601000308990479, "num_tokens": 651683040.0, "step": 17081 }, { "epoch": 2.1730059788830935, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.30007553100586, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8710424900054932, "num_tokens": 651718067.0, "step": 17082 }, { "epoch": 2.173133189161684, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.20772933959961, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.878021240234375, "num_tokens": 651761995.0, "step": 17083 }, { "epoch": 2.1732603994402746, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.81199645996094, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8769117593765259, "num_tokens": 651797780.0, "step": 17084 }, { "epoch": 2.173387609718865, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.76451873779297, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8492547869682312, "num_tokens": 651832931.0, "step": 17085 }, { "epoch": 2.1735148199974557, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.32368087768555, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8559836149215698, "num_tokens": 651868440.0, "step": 17086 }, { "epoch": 2.173642030276046, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.51487350463867, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8644264936447144, "num_tokens": 651900029.0, "step": 17087 }, { "epoch": 2.1737692405546367, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 47.094879150390625, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8580891489982605, "num_tokens": 651944892.0, "step": 17088 }, { "epoch": 2.1738964508332272, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.27744674682617, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8699540495872498, "num_tokens": 651979330.0, "step": 17089 }, { "epoch": 2.1740236611118178, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.73845291137695, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8538346290588379, "num_tokens": 652017389.0, "step": 17090 }, { "epoch": 2.1741508713904083, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 48.232994079589844, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8591755628585815, "num_tokens": 652052260.0, "step": 17091 }, { "epoch": 2.174278081668999, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.30108642578125, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8475379943847656, "num_tokens": 652089917.0, "step": 17092 }, { "epoch": 2.1744052919475894, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.79818344116211, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.866276204586029, "num_tokens": 652129849.0, "step": 17093 }, { "epoch": 2.17453250222618, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.333961486816406, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8528507351875305, "num_tokens": 652163178.0, "step": 17094 }, { "epoch": 2.1746597125047704, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.9682502746582, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8701236248016357, "num_tokens": 652201521.0, "step": 17095 }, { "epoch": 2.174786922783361, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.253318786621094, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8619633316993713, "num_tokens": 652234820.0, "step": 17096 }, { "epoch": 2.1749141330619515, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.848175048828125, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8605155944824219, "num_tokens": 652275686.0, "step": 17097 }, { "epoch": 2.175041343340542, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.24661636352539, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8532610535621643, "num_tokens": 652313243.0, "step": 17098 }, { "epoch": 2.1751685536191325, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.925804138183594, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8780554533004761, "num_tokens": 652351550.0, "step": 17099 }, { "epoch": 2.175295763897723, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.28810501098633, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8593650460243225, "num_tokens": 652399834.0, "step": 17100 }, { "epoch": 2.1754229741763136, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.767024993896484, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.8481066226959229, "num_tokens": 652435839.0, "step": 17101 }, { "epoch": 2.175550184454904, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.853431701660156, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8575290441513062, "num_tokens": 652475747.0, "step": 17102 }, { "epoch": 2.1756773947334946, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.398834228515625, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.874194860458374, "num_tokens": 652517147.0, "step": 17103 }, { "epoch": 2.175804605012085, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.95009231567383, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8737578392028809, "num_tokens": 652552811.0, "step": 17104 }, { "epoch": 2.1759318152906753, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.09182357788086, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8676431179046631, "num_tokens": 652587709.0, "step": 17105 }, { "epoch": 2.1760590255692662, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.88581848144531, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8586874008178711, "num_tokens": 652626213.0, "step": 17106 }, { "epoch": 2.1761862358478563, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.86030960083008, "learning_rate": 1e-06, "loss": 0.6849, "mean_token_accuracy": 0.8337411880493164, "num_tokens": 652665692.0, "step": 17107 }, { "epoch": 2.176313446126447, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.12004470825195, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8537790775299072, "num_tokens": 652705515.0, "step": 17108 }, { "epoch": 2.1764406564050374, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.629920959472656, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.8732165098190308, "num_tokens": 652743343.0, "step": 17109 }, { "epoch": 2.176567866683628, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.69110870361328, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8557566404342651, "num_tokens": 652784587.0, "step": 17110 }, { "epoch": 2.1766950769622184, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.774051666259766, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8538791537284851, "num_tokens": 652821985.0, "step": 17111 }, { "epoch": 2.176822287240809, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.11289978027344, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8730648756027222, "num_tokens": 652855772.0, "step": 17112 }, { "epoch": 2.1769494975193995, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.58592987060547, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8569120764732361, "num_tokens": 652890497.0, "step": 17113 }, { "epoch": 2.17707670779799, "ewc_loss": 0.1875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.84078598022461, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8716986179351807, "num_tokens": 652931611.0, "step": 17114 }, { "epoch": 2.1772039180765805, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.78181838989258, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8628820180892944, "num_tokens": 652972129.0, "step": 17115 }, { "epoch": 2.177331128355171, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.8323860168457, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8545467853546143, "num_tokens": 653009848.0, "step": 17116 }, { "epoch": 2.1774583386337616, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.71954345703125, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8603003025054932, "num_tokens": 653044614.0, "step": 17117 }, { "epoch": 2.177585548912352, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.14892578125, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8766364455223083, "num_tokens": 653087614.0, "step": 17118 }, { "epoch": 2.1777127591909426, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.42599105834961, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8756173253059387, "num_tokens": 653120434.0, "step": 17119 }, { "epoch": 2.177839969469533, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.85446548461914, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.87154221534729, "num_tokens": 653159390.0, "step": 17120 }, { "epoch": 2.1779671797481237, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.70201873779297, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8536324501037598, "num_tokens": 653201045.0, "step": 17121 }, { "epoch": 2.1780943900267142, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.46641540527344, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8650863766670227, "num_tokens": 653244968.0, "step": 17122 }, { "epoch": 2.1782216003053048, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.684303283691406, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8554522395133972, "num_tokens": 653281157.0, "step": 17123 }, { "epoch": 2.1783488105838953, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.490806579589844, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8754301071166992, "num_tokens": 653321055.0, "step": 17124 }, { "epoch": 2.178476020862486, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.17327880859375, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8498162031173706, "num_tokens": 653359320.0, "step": 17125 }, { "epoch": 2.1786032311410763, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.996036529541016, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8828760981559753, "num_tokens": 653404186.0, "step": 17126 }, { "epoch": 2.178730441419667, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.47718048095703, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8719314932823181, "num_tokens": 653438122.0, "step": 17127 }, { "epoch": 2.1788576516982574, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.20442199707031, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8615656495094299, "num_tokens": 653474727.0, "step": 17128 }, { "epoch": 2.178984861976848, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.15618133544922, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8679283261299133, "num_tokens": 653513531.0, "step": 17129 }, { "epoch": 2.179112072255438, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.08211898803711, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8760223984718323, "num_tokens": 653555871.0, "step": 17130 }, { "epoch": 2.1792392825340285, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.0289306640625, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8646818399429321, "num_tokens": 653598825.0, "step": 17131 }, { "epoch": 2.179366492812619, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.507747650146484, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.871907114982605, "num_tokens": 653639838.0, "step": 17132 }, { "epoch": 2.1794937030912096, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.49064254760742, "learning_rate": 1e-06, "loss": 0.6078, "mean_token_accuracy": 0.854723334312439, "num_tokens": 653678958.0, "step": 17133 }, { "epoch": 2.1796209133698, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.18474578857422, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8707479238510132, "num_tokens": 653718043.0, "step": 17134 }, { "epoch": 2.1797481236483907, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.2703857421875, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8790783286094666, "num_tokens": 653756308.0, "step": 17135 }, { "epoch": 2.179875333926981, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.7319221496582, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8707336187362671, "num_tokens": 653793140.0, "step": 17136 }, { "epoch": 2.1800025442055717, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.01816940307617, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8662493824958801, "num_tokens": 653823441.0, "step": 17137 }, { "epoch": 2.1801297544841622, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.4680061340332, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8535565137863159, "num_tokens": 653860743.0, "step": 17138 }, { "epoch": 2.1802569647627528, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.46684265136719, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8741706013679504, "num_tokens": 653894785.0, "step": 17139 }, { "epoch": 2.1803841750413433, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.31739044189453, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8592876195907593, "num_tokens": 653932208.0, "step": 17140 }, { "epoch": 2.180511385319934, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.337196350097656, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8485105037689209, "num_tokens": 653968818.0, "step": 17141 }, { "epoch": 2.1806385955985244, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.28407287597656, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8610120415687561, "num_tokens": 654010696.0, "step": 17142 }, { "epoch": 2.180765805877115, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.71664047241211, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.866864025592804, "num_tokens": 654047632.0, "step": 17143 }, { "epoch": 2.1808930161557054, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.14729690551758, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8665727376937866, "num_tokens": 654088200.0, "step": 17144 }, { "epoch": 2.181020226434296, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.082969665527344, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8601816892623901, "num_tokens": 654124225.0, "step": 17145 }, { "epoch": 2.1811474367128865, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.56269454956055, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8652273416519165, "num_tokens": 654163317.0, "step": 17146 }, { "epoch": 2.181274646991477, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.61302947998047, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8561334609985352, "num_tokens": 654199208.0, "step": 17147 }, { "epoch": 2.1814018572700675, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.20853805541992, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.851332426071167, "num_tokens": 654239988.0, "step": 17148 }, { "epoch": 2.181529067548658, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.688331604003906, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8552234172821045, "num_tokens": 654276361.0, "step": 17149 }, { "epoch": 2.1816562778272486, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.31520462036133, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8641409873962402, "num_tokens": 654316690.0, "step": 17150 }, { "epoch": 2.181783488105839, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.487709045410156, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.872986912727356, "num_tokens": 654354112.0, "step": 17151 }, { "epoch": 2.1819106983844296, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.24296569824219, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8750833868980408, "num_tokens": 654398896.0, "step": 17152 }, { "epoch": 2.18203790866302, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.17744445800781, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8768976926803589, "num_tokens": 654435180.0, "step": 17153 }, { "epoch": 2.1821651189416107, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.31473922729492, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8508052825927734, "num_tokens": 654473958.0, "step": 17154 }, { "epoch": 2.1822923292202008, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.183433532714844, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8488844633102417, "num_tokens": 654512918.0, "step": 17155 }, { "epoch": 2.1824195394987913, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.139122009277344, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8695608377456665, "num_tokens": 654552296.0, "step": 17156 }, { "epoch": 2.182546749777382, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.26451110839844, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8686907887458801, "num_tokens": 654592088.0, "step": 17157 }, { "epoch": 2.1826739600559724, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.30948257446289, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8555729985237122, "num_tokens": 654635119.0, "step": 17158 }, { "epoch": 2.182801170334563, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.28556823730469, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.851621150970459, "num_tokens": 654671204.0, "step": 17159 }, { "epoch": 2.1829283806131534, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.322505950927734, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8620189428329468, "num_tokens": 654702986.0, "step": 17160 }, { "epoch": 2.183055590891744, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.30082321166992, "learning_rate": 1e-06, "loss": 0.6642, "mean_token_accuracy": 0.8379448652267456, "num_tokens": 654745813.0, "step": 17161 }, { "epoch": 2.1831828011703345, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.4377555847168, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8608435392379761, "num_tokens": 654783305.0, "step": 17162 }, { "epoch": 2.183310011448925, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.47964859008789, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8706371188163757, "num_tokens": 654823427.0, "step": 17163 }, { "epoch": 2.1834372217275155, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.89207077026367, "learning_rate": 1e-06, "loss": 0.5552, "mean_token_accuracy": 0.8772287964820862, "num_tokens": 654862400.0, "step": 17164 }, { "epoch": 2.183564432006106, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.35391616821289, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8676506280899048, "num_tokens": 654902484.0, "step": 17165 }, { "epoch": 2.1836916422846966, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.03275680541992, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8687505722045898, "num_tokens": 654946882.0, "step": 17166 }, { "epoch": 2.183818852563287, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.72725296020508, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8525940775871277, "num_tokens": 654977850.0, "step": 17167 }, { "epoch": 2.1839460628418776, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.22714614868164, "learning_rate": 1e-06, "loss": 0.6432, "mean_token_accuracy": 0.8479537963867188, "num_tokens": 655015352.0, "step": 17168 }, { "epoch": 2.184073273120468, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.77178192138672, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8625806570053101, "num_tokens": 655047488.0, "step": 17169 }, { "epoch": 2.1842004833990587, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.57842254638672, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8600547909736633, "num_tokens": 655083410.0, "step": 17170 }, { "epoch": 2.1843276936776492, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.96805191040039, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8711527585983276, "num_tokens": 655120594.0, "step": 17171 }, { "epoch": 2.1844549039562398, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.85000228881836, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.860862135887146, "num_tokens": 655157310.0, "step": 17172 }, { "epoch": 2.1845821142348303, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.59335708618164, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.8708130121231079, "num_tokens": 655193749.0, "step": 17173 }, { "epoch": 2.184709324513421, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.76406478881836, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8779549598693848, "num_tokens": 655235532.0, "step": 17174 }, { "epoch": 2.1848365347920113, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.939815521240234, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.869666337966919, "num_tokens": 655282154.0, "step": 17175 }, { "epoch": 2.184963745070602, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.077056884765625, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8547625541687012, "num_tokens": 655321342.0, "step": 17176 }, { "epoch": 2.1850909553491924, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.683509826660156, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.880811333656311, "num_tokens": 655358107.0, "step": 17177 }, { "epoch": 2.185218165627783, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.2663459777832, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8572819232940674, "num_tokens": 655393462.0, "step": 17178 }, { "epoch": 2.1853453759063735, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.50618362426758, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8642867207527161, "num_tokens": 655436114.0, "step": 17179 }, { "epoch": 2.1854725861849635, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.50855255126953, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8638191819190979, "num_tokens": 655473119.0, "step": 17180 }, { "epoch": 2.185599796463554, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 46.71600341796875, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8573513031005859, "num_tokens": 655515537.0, "step": 17181 }, { "epoch": 2.1857270067421446, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.83730697631836, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.856523334980011, "num_tokens": 655555944.0, "step": 17182 }, { "epoch": 2.185854217020735, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.55527114868164, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.844875693321228, "num_tokens": 655589952.0, "step": 17183 }, { "epoch": 2.1859814272993257, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.45383834838867, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8743947744369507, "num_tokens": 655628269.0, "step": 17184 }, { "epoch": 2.186108637577916, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 46.96919250488281, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8668267130851746, "num_tokens": 655657315.0, "step": 17185 }, { "epoch": 2.1862358478565067, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.7790641784668, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8621934652328491, "num_tokens": 655696117.0, "step": 17186 }, { "epoch": 2.1863630581350972, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.0965461730957, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8441030979156494, "num_tokens": 655738823.0, "step": 17187 }, { "epoch": 2.1864902684136878, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.64344787597656, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8527617454528809, "num_tokens": 655781457.0, "step": 17188 }, { "epoch": 2.1866174786922783, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.137996673583984, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8574455976486206, "num_tokens": 655824704.0, "step": 17189 }, { "epoch": 2.186744688970869, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.459957122802734, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8725214004516602, "num_tokens": 655860013.0, "step": 17190 }, { "epoch": 2.1868718992494594, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.33537292480469, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8601770997047424, "num_tokens": 655897146.0, "step": 17191 }, { "epoch": 2.18699910952805, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.26271438598633, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.8673983216285706, "num_tokens": 655934299.0, "step": 17192 }, { "epoch": 2.1871263198066404, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.5870361328125, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8667706251144409, "num_tokens": 655976554.0, "step": 17193 }, { "epoch": 2.187253530085231, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.54030227661133, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.874541163444519, "num_tokens": 656011756.0, "step": 17194 }, { "epoch": 2.1873807403638215, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.39728546142578, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8610736131668091, "num_tokens": 656054405.0, "step": 17195 }, { "epoch": 2.187507950642412, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.42595672607422, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8669299483299255, "num_tokens": 656094668.0, "step": 17196 }, { "epoch": 2.1876351609210025, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.398990631103516, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.861559271812439, "num_tokens": 656137664.0, "step": 17197 }, { "epoch": 2.187762371199593, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.803707122802734, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8554269075393677, "num_tokens": 656180611.0, "step": 17198 }, { "epoch": 2.1878895814781836, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.92140579223633, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8529602885246277, "num_tokens": 656217789.0, "step": 17199 }, { "epoch": 2.188016791756774, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.85940933227539, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8583569526672363, "num_tokens": 656261940.0, "step": 17200 }, { "epoch": 2.1881440020353646, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.52175521850586, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8717027902603149, "num_tokens": 656296000.0, "step": 17201 }, { "epoch": 2.188271212313955, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 46.93553924560547, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8736884593963623, "num_tokens": 656340862.0, "step": 17202 }, { "epoch": 2.1883984225925452, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.65946960449219, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8886458277702332, "num_tokens": 656376923.0, "step": 17203 }, { "epoch": 2.188525632871136, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.231605529785156, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8587843775749207, "num_tokens": 656414058.0, "step": 17204 }, { "epoch": 2.1886528431497263, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.688270568847656, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8531643152236938, "num_tokens": 656450077.0, "step": 17205 }, { "epoch": 2.188780053428317, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.68149948120117, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8543553352355957, "num_tokens": 656487258.0, "step": 17206 }, { "epoch": 2.1889072637069074, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.09530258178711, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8753230571746826, "num_tokens": 656525141.0, "step": 17207 }, { "epoch": 2.189034473985498, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.13505554199219, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8664530515670776, "num_tokens": 656561517.0, "step": 17208 }, { "epoch": 2.1891616842640884, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.84751892089844, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8692505359649658, "num_tokens": 656595859.0, "step": 17209 }, { "epoch": 2.189288894542679, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.38429641723633, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8611925840377808, "num_tokens": 656627652.0, "step": 17210 }, { "epoch": 2.1894161048212695, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.598594665527344, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8687061071395874, "num_tokens": 656669951.0, "step": 17211 }, { "epoch": 2.18954331509986, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.659217834472656, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8741902112960815, "num_tokens": 656702216.0, "step": 17212 }, { "epoch": 2.1896705253784505, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.16005325317383, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8535910844802856, "num_tokens": 656744816.0, "step": 17213 }, { "epoch": 2.189797735657041, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.47441482543945, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8590400815010071, "num_tokens": 656785533.0, "step": 17214 }, { "epoch": 2.1899249459356316, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.62257766723633, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8609458804130554, "num_tokens": 656825375.0, "step": 17215 }, { "epoch": 2.190052156214222, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.540035247802734, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8707785606384277, "num_tokens": 656857951.0, "step": 17216 }, { "epoch": 2.1901793664928126, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.238006591796875, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.87157142162323, "num_tokens": 656896911.0, "step": 17217 }, { "epoch": 2.190306576771403, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.632102966308594, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8698453903198242, "num_tokens": 656935723.0, "step": 17218 }, { "epoch": 2.1904337870499937, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.33331298828125, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.876428484916687, "num_tokens": 656970881.0, "step": 17219 }, { "epoch": 2.1905609973285842, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.34306716918945, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8576748371124268, "num_tokens": 657007721.0, "step": 17220 }, { "epoch": 2.1906882076071748, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.0162239074707, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8724728226661682, "num_tokens": 657042359.0, "step": 17221 }, { "epoch": 2.1908154178857653, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.95383071899414, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8684132099151611, "num_tokens": 657084662.0, "step": 17222 }, { "epoch": 2.190942628164356, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.027462005615234, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8643594980239868, "num_tokens": 657122002.0, "step": 17223 }, { "epoch": 2.1910698384429463, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.6616096496582, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8639945983886719, "num_tokens": 657158768.0, "step": 17224 }, { "epoch": 2.191197048721537, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.018592834472656, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8518857359886169, "num_tokens": 657193445.0, "step": 17225 }, { "epoch": 2.1913242590001274, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.698394775390625, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.862477719783783, "num_tokens": 657234922.0, "step": 17226 }, { "epoch": 2.191451469278718, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.17348861694336, "learning_rate": 1e-06, "loss": 0.6262, "mean_token_accuracy": 0.853725016117096, "num_tokens": 657270774.0, "step": 17227 }, { "epoch": 2.191578679557308, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.24748611450195, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8786291480064392, "num_tokens": 657307046.0, "step": 17228 }, { "epoch": 2.1917058898358985, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.91402816772461, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8521578311920166, "num_tokens": 657352865.0, "step": 17229 }, { "epoch": 2.191833100114489, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.7210807800293, "learning_rate": 1e-06, "loss": 0.6573, "mean_token_accuracy": 0.8423656821250916, "num_tokens": 657393372.0, "step": 17230 }, { "epoch": 2.1919603103930796, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001544952392578125, "grad_norm": 46.73412322998047, "learning_rate": 1e-06, "loss": 0.6488, "mean_token_accuracy": 0.8442362546920776, "num_tokens": 657432445.0, "step": 17231 }, { "epoch": 2.19208752067167, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.07794189453125, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8626682758331299, "num_tokens": 657469145.0, "step": 17232 }, { "epoch": 2.1922147309502606, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 46.93912887573242, "learning_rate": 1e-06, "loss": 0.5467, "mean_token_accuracy": 0.8770127892494202, "num_tokens": 657510369.0, "step": 17233 }, { "epoch": 2.192341941228851, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.933475494384766, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8690001964569092, "num_tokens": 657550257.0, "step": 17234 }, { "epoch": 2.1924691515074417, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.13153076171875, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8649181127548218, "num_tokens": 657587095.0, "step": 17235 }, { "epoch": 2.1925963617860322, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.61170959472656, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.867843508720398, "num_tokens": 657619710.0, "step": 17236 }, { "epoch": 2.1927235720646228, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.37104034423828, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8594802021980286, "num_tokens": 657653316.0, "step": 17237 }, { "epoch": 2.1928507823432133, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.45111083984375, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8582276701927185, "num_tokens": 657693570.0, "step": 17238 }, { "epoch": 2.192977992621804, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.30047607421875, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8644081354141235, "num_tokens": 657736781.0, "step": 17239 }, { "epoch": 2.1931052029003943, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.923667907714844, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8543500900268555, "num_tokens": 657775603.0, "step": 17240 }, { "epoch": 2.193232413178985, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.9352912902832, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8618590831756592, "num_tokens": 657809065.0, "step": 17241 }, { "epoch": 2.1933596234575754, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.79755783081055, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8615918159484863, "num_tokens": 657851369.0, "step": 17242 }, { "epoch": 2.193486833736166, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.057228088378906, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8863353729248047, "num_tokens": 657889008.0, "step": 17243 }, { "epoch": 2.1936140440147565, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.76150894165039, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8623164892196655, "num_tokens": 657925867.0, "step": 17244 }, { "epoch": 2.193741254293347, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.01306915283203, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8529816269874573, "num_tokens": 657959502.0, "step": 17245 }, { "epoch": 2.1938684645719375, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.90704345703125, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8657377362251282, "num_tokens": 657999063.0, "step": 17246 }, { "epoch": 2.193995674850528, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.35694885253906, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8520181179046631, "num_tokens": 658033016.0, "step": 17247 }, { "epoch": 2.1941228851291186, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.660728454589844, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.869080662727356, "num_tokens": 658071259.0, "step": 17248 }, { "epoch": 2.194250095407709, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.29742431640625, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8647665977478027, "num_tokens": 658103190.0, "step": 17249 }, { "epoch": 2.1943773056862996, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.46725082397461, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8555721044540405, "num_tokens": 658143346.0, "step": 17250 }, { "epoch": 2.19450451596489, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.19496154785156, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8739809989929199, "num_tokens": 658181747.0, "step": 17251 }, { "epoch": 2.1946317262434807, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.53950500488281, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8630817532539368, "num_tokens": 658221770.0, "step": 17252 }, { "epoch": 2.1947589365220708, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.8183479309082, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.868261456489563, "num_tokens": 658260157.0, "step": 17253 }, { "epoch": 2.1948861468006613, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.74473190307617, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8523415923118591, "num_tokens": 658301815.0, "step": 17254 }, { "epoch": 2.195013357079252, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.800228118896484, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8659998774528503, "num_tokens": 658340732.0, "step": 17255 }, { "epoch": 2.1951405673578424, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.96799087524414, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8574036955833435, "num_tokens": 658380569.0, "step": 17256 }, { "epoch": 2.195267777636433, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.889286041259766, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8765876293182373, "num_tokens": 658415842.0, "step": 17257 }, { "epoch": 2.1953949879150234, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.18703079223633, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8681561946868896, "num_tokens": 658452861.0, "step": 17258 }, { "epoch": 2.195522198193614, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.308773040771484, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8661883473396301, "num_tokens": 658487019.0, "step": 17259 }, { "epoch": 2.1956494084722045, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.42819595336914, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8631504774093628, "num_tokens": 658529290.0, "step": 17260 }, { "epoch": 2.195776618750795, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.53671646118164, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8577894568443298, "num_tokens": 658572118.0, "step": 17261 }, { "epoch": 2.1959038290293855, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.459510803222656, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8605412840843201, "num_tokens": 658615972.0, "step": 17262 }, { "epoch": 2.196031039307976, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.33287811279297, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8722814321517944, "num_tokens": 658653848.0, "step": 17263 }, { "epoch": 2.1961582495865666, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.52606201171875, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8652461767196655, "num_tokens": 658686247.0, "step": 17264 }, { "epoch": 2.196285459865157, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.49116516113281, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8574569821357727, "num_tokens": 658725787.0, "step": 17265 }, { "epoch": 2.1964126701437476, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.27505874633789, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8586682081222534, "num_tokens": 658765957.0, "step": 17266 }, { "epoch": 2.196539880422338, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.455013275146484, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.874609649181366, "num_tokens": 658800996.0, "step": 17267 }, { "epoch": 2.1966670907009287, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.24679946899414, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8598639965057373, "num_tokens": 658833779.0, "step": 17268 }, { "epoch": 2.196794300979519, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.801124572753906, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.867062509059906, "num_tokens": 658874425.0, "step": 17269 }, { "epoch": 2.1969215112581097, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.254493713378906, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8706856966018677, "num_tokens": 658912007.0, "step": 17270 }, { "epoch": 2.1970487215367003, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.400428771972656, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8548819422721863, "num_tokens": 658956775.0, "step": 17271 }, { "epoch": 2.197175931815291, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.75893783569336, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8764920830726624, "num_tokens": 658997132.0, "step": 17272 }, { "epoch": 2.1973031420938813, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.98854064941406, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.8701760172843933, "num_tokens": 659040437.0, "step": 17273 }, { "epoch": 2.197430352372472, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.166316986083984, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8792742490768433, "num_tokens": 659074461.0, "step": 17274 }, { "epoch": 2.1975575626510624, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.95572280883789, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8605036735534668, "num_tokens": 659113818.0, "step": 17275 }, { "epoch": 2.197684772929653, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.46477508544922, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.8478553295135498, "num_tokens": 659149297.0, "step": 17276 }, { "epoch": 2.1978119832082434, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.305572509765625, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8669381141662598, "num_tokens": 659187500.0, "step": 17277 }, { "epoch": 2.1979391934868335, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.56169509887695, "learning_rate": 1e-06, "loss": 0.6814, "mean_token_accuracy": 0.8356794118881226, "num_tokens": 659228624.0, "step": 17278 }, { "epoch": 2.198066403765424, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.55345916748047, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8709701299667358, "num_tokens": 659270317.0, "step": 17279 }, { "epoch": 2.1981936140440146, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.1416015625, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8618539571762085, "num_tokens": 659302133.0, "step": 17280 }, { "epoch": 2.198320824322605, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.6790885925293, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8711509704589844, "num_tokens": 659342469.0, "step": 17281 }, { "epoch": 2.1984480346011956, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.89101791381836, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8759028911590576, "num_tokens": 659373786.0, "step": 17282 }, { "epoch": 2.198575244879786, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.135868072509766, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8548470735549927, "num_tokens": 659412498.0, "step": 17283 }, { "epoch": 2.1987024551583767, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.420082092285156, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8469845056533813, "num_tokens": 659446415.0, "step": 17284 }, { "epoch": 2.1988296654369672, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.62823486328125, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8556898832321167, "num_tokens": 659488824.0, "step": 17285 }, { "epoch": 2.1989568757155578, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 46.70568084716797, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.866529107093811, "num_tokens": 659523036.0, "step": 17286 }, { "epoch": 2.1990840859941483, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.795631408691406, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8575752377510071, "num_tokens": 659562780.0, "step": 17287 }, { "epoch": 2.199211296272739, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.09454345703125, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8698430061340332, "num_tokens": 659599364.0, "step": 17288 }, { "epoch": 2.1993385065513293, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.9781379699707, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.857251763343811, "num_tokens": 659636275.0, "step": 17289 }, { "epoch": 2.19946571682992, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.31950378417969, "learning_rate": 1e-06, "loss": 0.5704, "mean_token_accuracy": 0.8627634048461914, "num_tokens": 659670315.0, "step": 17290 }, { "epoch": 2.1995929271085104, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 46.96712875366211, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8588445782661438, "num_tokens": 659710968.0, "step": 17291 }, { "epoch": 2.199720137387101, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.94619369506836, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8729449510574341, "num_tokens": 659751334.0, "step": 17292 }, { "epoch": 2.1998473476656915, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.9892463684082, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8646349906921387, "num_tokens": 659793867.0, "step": 17293 }, { "epoch": 2.199974557944282, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.559844970703125, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8645409345626831, "num_tokens": 659838724.0, "step": 17294 }, { "epoch": 2.2001017682228725, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.88175582885742, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8615016937255859, "num_tokens": 659876589.0, "step": 17295 }, { "epoch": 2.200228978501463, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.30280685424805, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8549108505249023, "num_tokens": 659914864.0, "step": 17296 }, { "epoch": 2.2003561887800536, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 46.987667083740234, "learning_rate": 1e-06, "loss": 0.5539, "mean_token_accuracy": 0.870669960975647, "num_tokens": 659952482.0, "step": 17297 }, { "epoch": 2.200483399058644, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.517608642578125, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8617845773696899, "num_tokens": 659985192.0, "step": 17298 }, { "epoch": 2.2006106093372346, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.41490936279297, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.8715629577636719, "num_tokens": 660019505.0, "step": 17299 }, { "epoch": 2.200737819615825, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.244384765625, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8547413945198059, "num_tokens": 660054942.0, "step": 17300 }, { "epoch": 2.2008650298944152, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.56414794921875, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8601071834564209, "num_tokens": 660092836.0, "step": 17301 }, { "epoch": 2.200992240173006, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.29352951049805, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8589159250259399, "num_tokens": 660129419.0, "step": 17302 }, { "epoch": 2.2011194504515963, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.62779235839844, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8664112687110901, "num_tokens": 660168116.0, "step": 17303 }, { "epoch": 2.201246660730187, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.96976852416992, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8571555018424988, "num_tokens": 660208006.0, "step": 17304 }, { "epoch": 2.2013738710087773, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.56101608276367, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8632481098175049, "num_tokens": 660254046.0, "step": 17305 }, { "epoch": 2.201501081287368, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.255287170410156, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8616214394569397, "num_tokens": 660289321.0, "step": 17306 }, { "epoch": 2.2016282915659584, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.37777328491211, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8604768514633179, "num_tokens": 660325052.0, "step": 17307 }, { "epoch": 2.201755501844549, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.33559036254883, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8539512157440186, "num_tokens": 660357942.0, "step": 17308 }, { "epoch": 2.2018827121231395, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.28999328613281, "learning_rate": 1e-06, "loss": 0.6711, "mean_token_accuracy": 0.8376152515411377, "num_tokens": 660390654.0, "step": 17309 }, { "epoch": 2.20200992240173, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.337833404541016, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8626794219017029, "num_tokens": 660432020.0, "step": 17310 }, { "epoch": 2.2021371326803205, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.35177230834961, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8638535737991333, "num_tokens": 660465051.0, "step": 17311 }, { "epoch": 2.202264342958911, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.43062210083008, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8579064607620239, "num_tokens": 660504967.0, "step": 17312 }, { "epoch": 2.2023915532375016, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.14278793334961, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8523485660552979, "num_tokens": 660543826.0, "step": 17313 }, { "epoch": 2.202518763516092, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.703758239746094, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8572992086410522, "num_tokens": 660583074.0, "step": 17314 }, { "epoch": 2.2026459737946826, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 46.995670318603516, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8806475400924683, "num_tokens": 660619974.0, "step": 17315 }, { "epoch": 2.202773184073273, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.07270050048828, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8690251111984253, "num_tokens": 660654005.0, "step": 17316 }, { "epoch": 2.2029003943518637, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.93629455566406, "learning_rate": 1e-06, "loss": 0.6272, "mean_token_accuracy": 0.8515422344207764, "num_tokens": 660689179.0, "step": 17317 }, { "epoch": 2.203027604630454, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.9898567199707, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8508820533752441, "num_tokens": 660724923.0, "step": 17318 }, { "epoch": 2.2031548149090447, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.334228515625, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8667566776275635, "num_tokens": 660760763.0, "step": 17319 }, { "epoch": 2.2032820251876353, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.42844009399414, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8630560636520386, "num_tokens": 660795520.0, "step": 17320 }, { "epoch": 2.203409235466226, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.34779739379883, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8711671233177185, "num_tokens": 660834858.0, "step": 17321 }, { "epoch": 2.2035364457448163, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.79029083251953, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8702144026756287, "num_tokens": 660870319.0, "step": 17322 }, { "epoch": 2.203663656023407, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.671653747558594, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8495515584945679, "num_tokens": 660907313.0, "step": 17323 }, { "epoch": 2.2037908663019974, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.29006576538086, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8775548934936523, "num_tokens": 660948087.0, "step": 17324 }, { "epoch": 2.203918076580588, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.94149398803711, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.866558313369751, "num_tokens": 660984984.0, "step": 17325 }, { "epoch": 2.204045286859178, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.08263397216797, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8603231310844421, "num_tokens": 661022898.0, "step": 17326 }, { "epoch": 2.2041724971377685, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.057884216308594, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8725976943969727, "num_tokens": 661058645.0, "step": 17327 }, { "epoch": 2.204299707416359, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.785499572753906, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8854352235794067, "num_tokens": 661096954.0, "step": 17328 }, { "epoch": 2.2044269176949496, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.38210678100586, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8527925610542297, "num_tokens": 661136947.0, "step": 17329 }, { "epoch": 2.20455412797354, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.27602767944336, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.863511860370636, "num_tokens": 661175376.0, "step": 17330 }, { "epoch": 2.2046813382521306, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.99810028076172, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8565085530281067, "num_tokens": 661213238.0, "step": 17331 }, { "epoch": 2.204808548530721, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.32172775268555, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8795216083526611, "num_tokens": 661246512.0, "step": 17332 }, { "epoch": 2.2049357588093117, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.569122314453125, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8749935626983643, "num_tokens": 661288272.0, "step": 17333 }, { "epoch": 2.2050629690879022, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.080379486083984, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8699597716331482, "num_tokens": 661328705.0, "step": 17334 }, { "epoch": 2.2051901793664928, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.02742385864258, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8662084937095642, "num_tokens": 661369877.0, "step": 17335 }, { "epoch": 2.2053173896450833, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.06926727294922, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8661421537399292, "num_tokens": 661405209.0, "step": 17336 }, { "epoch": 2.205444599923674, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.162113189697266, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8683900237083435, "num_tokens": 661441083.0, "step": 17337 }, { "epoch": 2.2055718102022643, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.933876037597656, "learning_rate": 1e-06, "loss": 0.5812, "mean_token_accuracy": 0.8649438619613647, "num_tokens": 661476938.0, "step": 17338 }, { "epoch": 2.205699020480855, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 46.76265335083008, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8551868200302124, "num_tokens": 661519281.0, "step": 17339 }, { "epoch": 2.2058262307594454, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.981605529785156, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8798627257347107, "num_tokens": 661555880.0, "step": 17340 }, { "epoch": 2.205953441038036, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.21982955932617, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8554588556289673, "num_tokens": 661595937.0, "step": 17341 }, { "epoch": 2.2060806513166265, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.120426177978516, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8488364219665527, "num_tokens": 661636868.0, "step": 17342 }, { "epoch": 2.206207861595217, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.590614318847656, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8549038171768188, "num_tokens": 661681503.0, "step": 17343 }, { "epoch": 2.2063350718738075, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.7532844543457, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8654899597167969, "num_tokens": 661718438.0, "step": 17344 }, { "epoch": 2.206462282152398, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.62152862548828, "learning_rate": 1e-06, "loss": 0.6453, "mean_token_accuracy": 0.843646228313446, "num_tokens": 661759048.0, "step": 17345 }, { "epoch": 2.2065894924309886, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.69694137573242, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8737670183181763, "num_tokens": 661803131.0, "step": 17346 }, { "epoch": 2.206716702709579, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.227359771728516, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8636994361877441, "num_tokens": 661838139.0, "step": 17347 }, { "epoch": 2.2068439129881696, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.8808479309082, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.856479823589325, "num_tokens": 661878568.0, "step": 17348 }, { "epoch": 2.20697112326676, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.36345672607422, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.856541097164154, "num_tokens": 661909110.0, "step": 17349 }, { "epoch": 2.2070983335453507, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.12991714477539, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8767136931419373, "num_tokens": 661944472.0, "step": 17350 }, { "epoch": 2.2072255438239408, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.21035385131836, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8751471042633057, "num_tokens": 661985865.0, "step": 17351 }, { "epoch": 2.2073527541025313, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.36452865600586, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8585268259048462, "num_tokens": 662019460.0, "step": 17352 }, { "epoch": 2.207479964381122, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.00920486450195, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8708963990211487, "num_tokens": 662059109.0, "step": 17353 }, { "epoch": 2.2076071746597123, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 46.747833251953125, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8546000719070435, "num_tokens": 662096040.0, "step": 17354 }, { "epoch": 2.207734384938303, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.365848541259766, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8687334656715393, "num_tokens": 662139351.0, "step": 17355 }, { "epoch": 2.2078615952168934, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.1683349609375, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8649242520332336, "num_tokens": 662177353.0, "step": 17356 }, { "epoch": 2.207988805495484, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.0665283203125, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.851689338684082, "num_tokens": 662209519.0, "step": 17357 }, { "epoch": 2.2081160157740745, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.30877685546875, "learning_rate": 1e-06, "loss": 0.6475, "mean_token_accuracy": 0.8448421955108643, "num_tokens": 662251311.0, "step": 17358 }, { "epoch": 2.208243226052665, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.734130859375, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8701726198196411, "num_tokens": 662282770.0, "step": 17359 }, { "epoch": 2.2083704363312555, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.357398986816406, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.858681321144104, "num_tokens": 662326770.0, "step": 17360 }, { "epoch": 2.208497646609846, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.473819732666016, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8628093004226685, "num_tokens": 662366811.0, "step": 17361 }, { "epoch": 2.2086248568884366, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.36924743652344, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8598941564559937, "num_tokens": 662404517.0, "step": 17362 }, { "epoch": 2.208752067167027, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.78045654296875, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8679742813110352, "num_tokens": 662442091.0, "step": 17363 }, { "epoch": 2.2088792774456176, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.00995635986328, "learning_rate": 1e-06, "loss": 0.6627, "mean_token_accuracy": 0.8378448486328125, "num_tokens": 662483343.0, "step": 17364 }, { "epoch": 2.209006487724208, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.08007049560547, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8557161092758179, "num_tokens": 662521994.0, "step": 17365 }, { "epoch": 2.2091336980027987, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.26708984375, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8587731122970581, "num_tokens": 662562737.0, "step": 17366 }, { "epoch": 2.209260908281389, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.046695709228516, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8585888147354126, "num_tokens": 662596927.0, "step": 17367 }, { "epoch": 2.2093881185599797, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.63358688354492, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8724333643913269, "num_tokens": 662632831.0, "step": 17368 }, { "epoch": 2.2095153288385703, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.648250579833984, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8587572574615479, "num_tokens": 662675554.0, "step": 17369 }, { "epoch": 2.209642539117161, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.10699462890625, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8759625554084778, "num_tokens": 662720248.0, "step": 17370 }, { "epoch": 2.2097697493957513, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.685001373291016, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8634315729141235, "num_tokens": 662761388.0, "step": 17371 }, { "epoch": 2.209896959674342, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.41939163208008, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8617217540740967, "num_tokens": 662798007.0, "step": 17372 }, { "epoch": 2.2100241699529324, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.53420639038086, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8596245646476746, "num_tokens": 662836042.0, "step": 17373 }, { "epoch": 2.210151380231523, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.679649353027344, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8527765870094299, "num_tokens": 662869465.0, "step": 17374 }, { "epoch": 2.2102785905101134, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.6219482421875, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8552746176719666, "num_tokens": 662909440.0, "step": 17375 }, { "epoch": 2.2104058007887035, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.36517333984375, "learning_rate": 1e-06, "loss": 0.6037, "mean_token_accuracy": 0.8602133989334106, "num_tokens": 662949637.0, "step": 17376 }, { "epoch": 2.210533011067294, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.898616790771484, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8600161075592041, "num_tokens": 662987679.0, "step": 17377 }, { "epoch": 2.2106602213458846, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.652587890625, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8431442379951477, "num_tokens": 663027651.0, "step": 17378 }, { "epoch": 2.210787431624475, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.83342361450195, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8764761686325073, "num_tokens": 663063708.0, "step": 17379 }, { "epoch": 2.2109146419030656, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.912078857421875, "learning_rate": 1e-06, "loss": 0.632, "mean_token_accuracy": 0.8535251617431641, "num_tokens": 663097255.0, "step": 17380 }, { "epoch": 2.211041852181656, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.37554168701172, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8589482307434082, "num_tokens": 663134496.0, "step": 17381 }, { "epoch": 2.2111690624602467, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.0532112121582, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8722145557403564, "num_tokens": 663170278.0, "step": 17382 }, { "epoch": 2.211296272738837, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.49052810668945, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.860792875289917, "num_tokens": 663205407.0, "step": 17383 }, { "epoch": 2.2114234830174277, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.967201232910156, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8674134612083435, "num_tokens": 663243166.0, "step": 17384 }, { "epoch": 2.2115506932960183, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.341041564941406, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8851669430732727, "num_tokens": 663282267.0, "step": 17385 }, { "epoch": 2.211677903574609, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.207576751708984, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8538830280303955, "num_tokens": 663318620.0, "step": 17386 }, { "epoch": 2.2118051138531993, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.28664779663086, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8800894618034363, "num_tokens": 663353256.0, "step": 17387 }, { "epoch": 2.21193232413179, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.08591842651367, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.865880012512207, "num_tokens": 663386691.0, "step": 17388 }, { "epoch": 2.2120595344103804, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.53349304199219, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8592855334281921, "num_tokens": 663416550.0, "step": 17389 }, { "epoch": 2.212186744688971, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.013526916503906, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8657864332199097, "num_tokens": 663464912.0, "step": 17390 }, { "epoch": 2.2123139549675614, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.07821273803711, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8543766140937805, "num_tokens": 663503786.0, "step": 17391 }, { "epoch": 2.212441165246152, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.766502380371094, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8701143264770508, "num_tokens": 663540579.0, "step": 17392 }, { "epoch": 2.2125683755247425, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.602359771728516, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8641526699066162, "num_tokens": 663577284.0, "step": 17393 }, { "epoch": 2.212695585803333, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.76127243041992, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8568223714828491, "num_tokens": 663613118.0, "step": 17394 }, { "epoch": 2.2128227960819236, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.76358413696289, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.8681210279464722, "num_tokens": 663651548.0, "step": 17395 }, { "epoch": 2.212950006360514, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.7452392578125, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8448317050933838, "num_tokens": 663694381.0, "step": 17396 }, { "epoch": 2.2130772166391046, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 48.648616790771484, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8649784922599792, "num_tokens": 663734686.0, "step": 17397 }, { "epoch": 2.213204426917695, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.967899322509766, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.854427695274353, "num_tokens": 663768764.0, "step": 17398 }, { "epoch": 2.2133316371962852, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.621707916259766, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8832929730415344, "num_tokens": 663807650.0, "step": 17399 }, { "epoch": 2.213458847474876, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.342796325683594, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8607797026634216, "num_tokens": 663849308.0, "step": 17400 }, { "epoch": 2.2135860577534663, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.44636154174805, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8477625846862793, "num_tokens": 663888917.0, "step": 17401 }, { "epoch": 2.213713268032057, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.091060638427734, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8616007566452026, "num_tokens": 663921048.0, "step": 17402 }, { "epoch": 2.2138404783106473, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.48594284057617, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8667222261428833, "num_tokens": 663963433.0, "step": 17403 }, { "epoch": 2.213967688589238, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.65000534057617, "learning_rate": 1e-06, "loss": 0.6479, "mean_token_accuracy": 0.8446012139320374, "num_tokens": 664003294.0, "step": 17404 }, { "epoch": 2.2140948988678284, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.13198471069336, "learning_rate": 1e-06, "loss": 0.6245, "mean_token_accuracy": 0.8520538806915283, "num_tokens": 664043161.0, "step": 17405 }, { "epoch": 2.214222109146419, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.454498291015625, "learning_rate": 1e-06, "loss": 0.5477, "mean_token_accuracy": 0.8736402988433838, "num_tokens": 664082457.0, "step": 17406 }, { "epoch": 2.2143493194250095, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.979732513427734, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8741491436958313, "num_tokens": 664125999.0, "step": 17407 }, { "epoch": 2.2144765297036, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.630126953125, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.869475245475769, "num_tokens": 664167256.0, "step": 17408 }, { "epoch": 2.2146037399821905, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.98979187011719, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8601130247116089, "num_tokens": 664201887.0, "step": 17409 }, { "epoch": 2.214730950260781, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.379947662353516, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8678263425827026, "num_tokens": 664238986.0, "step": 17410 }, { "epoch": 2.2148581605393716, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.444698333740234, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8580409288406372, "num_tokens": 664283436.0, "step": 17411 }, { "epoch": 2.214985370817962, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.505462646484375, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8562508821487427, "num_tokens": 664325196.0, "step": 17412 }, { "epoch": 2.2151125810965526, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.77641296386719, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8537720441818237, "num_tokens": 664365722.0, "step": 17413 }, { "epoch": 2.215239791375143, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.37720489501953, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8528549671173096, "num_tokens": 664411570.0, "step": 17414 }, { "epoch": 2.2153670016537337, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.582923889160156, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8661934733390808, "num_tokens": 664453545.0, "step": 17415 }, { "epoch": 2.215494211932324, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.48983383178711, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8621401786804199, "num_tokens": 664489882.0, "step": 17416 }, { "epoch": 2.2156214222109147, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.44282150268555, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.862707257270813, "num_tokens": 664528720.0, "step": 17417 }, { "epoch": 2.2157486324895053, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.59005355834961, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8515071868896484, "num_tokens": 664569645.0, "step": 17418 }, { "epoch": 2.215875842768096, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.137149810791016, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8566763401031494, "num_tokens": 664612033.0, "step": 17419 }, { "epoch": 2.2160030530466863, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.14026641845703, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8698325157165527, "num_tokens": 664651408.0, "step": 17420 }, { "epoch": 2.216130263325277, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.458919525146484, "learning_rate": 1e-06, "loss": 0.6437, "mean_token_accuracy": 0.8483210206031799, "num_tokens": 664695764.0, "step": 17421 }, { "epoch": 2.2162574736038674, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.76716613769531, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8644090294837952, "num_tokens": 664735528.0, "step": 17422 }, { "epoch": 2.216384683882458, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.95166015625, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8537994623184204, "num_tokens": 664775612.0, "step": 17423 }, { "epoch": 2.216511894161048, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.43016052246094, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8587911128997803, "num_tokens": 664812485.0, "step": 17424 }, { "epoch": 2.2166391044396385, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.441200256347656, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8477177619934082, "num_tokens": 664853166.0, "step": 17425 }, { "epoch": 2.216766314718229, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.374839782714844, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8513236045837402, "num_tokens": 664888778.0, "step": 17426 }, { "epoch": 2.2168935249968196, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.59555435180664, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8600698709487915, "num_tokens": 664933783.0, "step": 17427 }, { "epoch": 2.21702073527541, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.563453674316406, "learning_rate": 1e-06, "loss": 0.6644, "mean_token_accuracy": 0.8446113467216492, "num_tokens": 664965228.0, "step": 17428 }, { "epoch": 2.2171479455540006, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.33740234375, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8590399026870728, "num_tokens": 664996691.0, "step": 17429 }, { "epoch": 2.217275155832591, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.8802490234375, "learning_rate": 1e-06, "loss": 0.5531, "mean_token_accuracy": 0.8737797737121582, "num_tokens": 665032006.0, "step": 17430 }, { "epoch": 2.2174023661111817, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.756832122802734, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8515980243682861, "num_tokens": 665071819.0, "step": 17431 }, { "epoch": 2.217529576389772, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.29150390625, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8496523499488831, "num_tokens": 665118883.0, "step": 17432 }, { "epoch": 2.2176567866683627, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.972660064697266, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8427252173423767, "num_tokens": 665154926.0, "step": 17433 }, { "epoch": 2.2177839969469533, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.626163482666016, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8693602085113525, "num_tokens": 665189910.0, "step": 17434 }, { "epoch": 2.217911207225544, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.33400344848633, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8695160150527954, "num_tokens": 665235849.0, "step": 17435 }, { "epoch": 2.2180384175041343, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.31955337524414, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8517866134643555, "num_tokens": 665274231.0, "step": 17436 }, { "epoch": 2.218165627782725, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.41805648803711, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8793156147003174, "num_tokens": 665309057.0, "step": 17437 }, { "epoch": 2.2182928380613154, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.18069839477539, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8569715619087219, "num_tokens": 665337031.0, "step": 17438 }, { "epoch": 2.218420048339906, "ewc_loss": 0.1767578125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015354156494140625, "grad_norm": 47.22157669067383, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8607277870178223, "num_tokens": 665376557.0, "step": 17439 }, { "epoch": 2.2185472586184964, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.09721755981445, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8615390062332153, "num_tokens": 665419928.0, "step": 17440 }, { "epoch": 2.218674468897087, "ewc_loss": 0.177734375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.79966735839844, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8695310354232788, "num_tokens": 665458827.0, "step": 17441 }, { "epoch": 2.2188016791756775, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.10972595214844, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.863860011100769, "num_tokens": 665498623.0, "step": 17442 }, { "epoch": 2.218928889454268, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.1200065612793, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8559338450431824, "num_tokens": 665536103.0, "step": 17443 }, { "epoch": 2.2190560997328586, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.58687973022461, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8590797185897827, "num_tokens": 665572089.0, "step": 17444 }, { "epoch": 2.219183310011449, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.49440383911133, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.853294849395752, "num_tokens": 665608088.0, "step": 17445 }, { "epoch": 2.2193105202900396, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.51692199707031, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8712372779846191, "num_tokens": 665641883.0, "step": 17446 }, { "epoch": 2.21943773056863, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.18865966796875, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8664535880088806, "num_tokens": 665682211.0, "step": 17447 }, { "epoch": 2.2195649408472207, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.462562561035156, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8665515780448914, "num_tokens": 665726182.0, "step": 17448 }, { "epoch": 2.2196921511258108, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.563148498535156, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8664072155952454, "num_tokens": 665760404.0, "step": 17449 }, { "epoch": 2.2198193614044013, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.902809143066406, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8601537942886353, "num_tokens": 665798235.0, "step": 17450 }, { "epoch": 2.219946571682992, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.24226760864258, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8629082441329956, "num_tokens": 665840118.0, "step": 17451 }, { "epoch": 2.2200737819615823, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.223663330078125, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8607177734375, "num_tokens": 665878498.0, "step": 17452 }, { "epoch": 2.220200992240173, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.07701110839844, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8576856255531311, "num_tokens": 665924179.0, "step": 17453 }, { "epoch": 2.2203282025187634, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.252716064453125, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8763846755027771, "num_tokens": 665963160.0, "step": 17454 }, { "epoch": 2.220455412797354, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.089393615722656, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8716669082641602, "num_tokens": 665998854.0, "step": 17455 }, { "epoch": 2.2205826230759445, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.6868896484375, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8703033924102783, "num_tokens": 666036420.0, "step": 17456 }, { "epoch": 2.220709833354535, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.855411529541016, "learning_rate": 1e-06, "loss": 0.6627, "mean_token_accuracy": 0.8409260511398315, "num_tokens": 666076692.0, "step": 17457 }, { "epoch": 2.2208370436331255, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.80484390258789, "learning_rate": 1e-06, "loss": 0.6502, "mean_token_accuracy": 0.847366213798523, "num_tokens": 666118009.0, "step": 17458 }, { "epoch": 2.220964253911716, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.729896545410156, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8523939251899719, "num_tokens": 666158132.0, "step": 17459 }, { "epoch": 2.2210914641903066, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.36875534057617, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8571567535400391, "num_tokens": 666187195.0, "step": 17460 }, { "epoch": 2.221218674468897, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.134605407714844, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8654367923736572, "num_tokens": 666227790.0, "step": 17461 }, { "epoch": 2.2213458847474876, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.032291412353516, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8552877902984619, "num_tokens": 666268517.0, "step": 17462 }, { "epoch": 2.221473095026078, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.35419845581055, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8789366483688354, "num_tokens": 666313727.0, "step": 17463 }, { "epoch": 2.2216003053046687, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.81216049194336, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8655927777290344, "num_tokens": 666352713.0, "step": 17464 }, { "epoch": 2.221727515583259, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.030765533447266, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8703195452690125, "num_tokens": 666395065.0, "step": 17465 }, { "epoch": 2.2218547258618497, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.07713317871094, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.86336350440979, "num_tokens": 666428019.0, "step": 17466 }, { "epoch": 2.2219819361404403, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.25848388671875, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8611114025115967, "num_tokens": 666466830.0, "step": 17467 }, { "epoch": 2.222109146419031, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.02800750732422, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8791792392730713, "num_tokens": 666501028.0, "step": 17468 }, { "epoch": 2.2222363566976213, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.263362884521484, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8750441670417786, "num_tokens": 666538554.0, "step": 17469 }, { "epoch": 2.222363566976212, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.98406982421875, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8611252307891846, "num_tokens": 666577019.0, "step": 17470 }, { "epoch": 2.2224907772548024, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.65073776245117, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8768457770347595, "num_tokens": 666616597.0, "step": 17471 }, { "epoch": 2.222617987533393, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.629058837890625, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8647696375846863, "num_tokens": 666651523.0, "step": 17472 }, { "epoch": 2.2227451978119834, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.430992126464844, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8515217900276184, "num_tokens": 666689270.0, "step": 17473 }, { "epoch": 2.2228724080905735, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 48.01400375366211, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8589454889297485, "num_tokens": 666726836.0, "step": 17474 }, { "epoch": 2.222999618369164, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.07308578491211, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.858557939529419, "num_tokens": 666761207.0, "step": 17475 }, { "epoch": 2.2231268286477546, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.460540771484375, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8702220916748047, "num_tokens": 666800293.0, "step": 17476 }, { "epoch": 2.223254038926345, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.70747375488281, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8713322877883911, "num_tokens": 666843046.0, "step": 17477 }, { "epoch": 2.2233812492049356, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.466590881347656, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8760388493537903, "num_tokens": 666885840.0, "step": 17478 }, { "epoch": 2.223508459483526, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.03766632080078, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8631304502487183, "num_tokens": 666925654.0, "step": 17479 }, { "epoch": 2.2236356697621167, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.268821716308594, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8635237216949463, "num_tokens": 666968098.0, "step": 17480 }, { "epoch": 2.223762880040707, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.8492546081543, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.873873233795166, "num_tokens": 667010349.0, "step": 17481 }, { "epoch": 2.2238900903192977, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.443050384521484, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8528964519500732, "num_tokens": 667047086.0, "step": 17482 }, { "epoch": 2.2240173005978883, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.90037155151367, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.8453405499458313, "num_tokens": 667083955.0, "step": 17483 }, { "epoch": 2.224144510876479, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.35435104370117, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8676141500473022, "num_tokens": 667119751.0, "step": 17484 }, { "epoch": 2.2242717211550693, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.897464752197266, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8567686080932617, "num_tokens": 667150980.0, "step": 17485 }, { "epoch": 2.22439893143366, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.33776092529297, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8585838079452515, "num_tokens": 667185198.0, "step": 17486 }, { "epoch": 2.2245261417122504, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.011940002441406, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8641316890716553, "num_tokens": 667225286.0, "step": 17487 }, { "epoch": 2.224653351990841, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.50119400024414, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8565609455108643, "num_tokens": 667266294.0, "step": 17488 }, { "epoch": 2.2247805622694314, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.63644790649414, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8613473176956177, "num_tokens": 667302430.0, "step": 17489 }, { "epoch": 2.224907772548022, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.00448989868164, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8557137250900269, "num_tokens": 667342881.0, "step": 17490 }, { "epoch": 2.2250349828266125, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.24285125732422, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8633155226707458, "num_tokens": 667376122.0, "step": 17491 }, { "epoch": 2.225162193105203, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.175575256347656, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8535283803939819, "num_tokens": 667415120.0, "step": 17492 }, { "epoch": 2.2252894033837936, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.11009979248047, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8598630428314209, "num_tokens": 667451014.0, "step": 17493 }, { "epoch": 2.225416613662384, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.94951629638672, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8530627489089966, "num_tokens": 667485600.0, "step": 17494 }, { "epoch": 2.2255438239409746, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.703697204589844, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8570119142532349, "num_tokens": 667527561.0, "step": 17495 }, { "epoch": 2.225671034219565, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.32545852661133, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8698388934135437, "num_tokens": 667569677.0, "step": 17496 }, { "epoch": 2.225798244498155, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.50527572631836, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8611024618148804, "num_tokens": 667605954.0, "step": 17497 }, { "epoch": 2.225925454776746, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.36678695678711, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8647165298461914, "num_tokens": 667645082.0, "step": 17498 }, { "epoch": 2.2260526650553363, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.298065185546875, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8597127795219421, "num_tokens": 667681390.0, "step": 17499 }, { "epoch": 2.226179875333927, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.60780334472656, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8522782325744629, "num_tokens": 667722772.0, "step": 17500 }, { "epoch": 2.2263070856125173, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.695274353027344, "learning_rate": 1e-06, "loss": 0.6782, "mean_token_accuracy": 0.8371574878692627, "num_tokens": 667758647.0, "step": 17501 }, { "epoch": 2.226434295891108, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.588741302490234, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8621646761894226, "num_tokens": 667790625.0, "step": 17502 }, { "epoch": 2.2265615061696984, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.292415618896484, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8514978289604187, "num_tokens": 667823724.0, "step": 17503 }, { "epoch": 2.226688716448289, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.89366912841797, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8751137256622314, "num_tokens": 667862548.0, "step": 17504 }, { "epoch": 2.2268159267268794, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.80671691894531, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.854755163192749, "num_tokens": 667905994.0, "step": 17505 }, { "epoch": 2.22694313700547, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.68125534057617, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8554157018661499, "num_tokens": 667945087.0, "step": 17506 }, { "epoch": 2.2270703472840605, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.981422424316406, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.87161785364151, "num_tokens": 667984736.0, "step": 17507 }, { "epoch": 2.227197557562651, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.5694580078125, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8566216230392456, "num_tokens": 668024020.0, "step": 17508 }, { "epoch": 2.2273247678412416, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.16496276855469, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8567330837249756, "num_tokens": 668059567.0, "step": 17509 }, { "epoch": 2.227451978119832, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.78073501586914, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8677115440368652, "num_tokens": 668099901.0, "step": 17510 }, { "epoch": 2.2275791883984226, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.79048538208008, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8758894205093384, "num_tokens": 668135974.0, "step": 17511 }, { "epoch": 2.227706398677013, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.143226623535156, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8654424548149109, "num_tokens": 668179200.0, "step": 17512 }, { "epoch": 2.2278336089556037, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.87604904174805, "learning_rate": 1e-06, "loss": 0.6596, "mean_token_accuracy": 0.838765561580658, "num_tokens": 668219683.0, "step": 17513 }, { "epoch": 2.227960819234194, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.111053466796875, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8630490303039551, "num_tokens": 668253899.0, "step": 17514 }, { "epoch": 2.2280880295127847, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.85427474975586, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8550351858139038, "num_tokens": 668294140.0, "step": 17515 }, { "epoch": 2.2282152397913753, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.0385627746582, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8685398101806641, "num_tokens": 668328442.0, "step": 17516 }, { "epoch": 2.228342450069966, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.91214370727539, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8636211156845093, "num_tokens": 668367903.0, "step": 17517 }, { "epoch": 2.2284696603485563, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.86911392211914, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8742178082466125, "num_tokens": 668409889.0, "step": 17518 }, { "epoch": 2.228596870627147, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.88028335571289, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8662247061729431, "num_tokens": 668451061.0, "step": 17519 }, { "epoch": 2.2287240809057374, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.158565521240234, "learning_rate": 1e-06, "loss": 0.6547, "mean_token_accuracy": 0.8405964374542236, "num_tokens": 668493038.0, "step": 17520 }, { "epoch": 2.228851291184328, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.539772033691406, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8690750002861023, "num_tokens": 668528087.0, "step": 17521 }, { "epoch": 2.228978501462918, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.647239685058594, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8625807762145996, "num_tokens": 668562325.0, "step": 17522 }, { "epoch": 2.2291057117415085, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.130062103271484, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8644565939903259, "num_tokens": 668601336.0, "step": 17523 }, { "epoch": 2.229232922020099, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.84407043457031, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8817237615585327, "num_tokens": 668637113.0, "step": 17524 }, { "epoch": 2.2293601322986896, "ewc_loss": 0.1787109375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015544891357421875, "grad_norm": 47.24976348876953, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.856961190700531, "num_tokens": 668675645.0, "step": 17525 }, { "epoch": 2.22948734257728, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.73394012451172, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8614468574523926, "num_tokens": 668706480.0, "step": 17526 }, { "epoch": 2.2296145528558706, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.360721588134766, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8577443361282349, "num_tokens": 668743956.0, "step": 17527 }, { "epoch": 2.229741763134461, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.99868392944336, "learning_rate": 1e-06, "loss": 0.5542, "mean_token_accuracy": 0.8708961009979248, "num_tokens": 668779134.0, "step": 17528 }, { "epoch": 2.2298689734130517, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.25919723510742, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8649136424064636, "num_tokens": 668814978.0, "step": 17529 }, { "epoch": 2.229996183691642, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.55823516845703, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8453713655471802, "num_tokens": 668854194.0, "step": 17530 }, { "epoch": 2.2301233939702327, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.48087692260742, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.853114902973175, "num_tokens": 668893095.0, "step": 17531 }, { "epoch": 2.2302506042488233, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.534725189208984, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8701643943786621, "num_tokens": 668930626.0, "step": 17532 }, { "epoch": 2.230377814527414, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.093196868896484, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8695155382156372, "num_tokens": 668960941.0, "step": 17533 }, { "epoch": 2.2305050248060043, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.99653244018555, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8637261390686035, "num_tokens": 668997771.0, "step": 17534 }, { "epoch": 2.230632235084595, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.6669921875, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8765521049499512, "num_tokens": 669038714.0, "step": 17535 }, { "epoch": 2.2307594453631854, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.03720474243164, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8693534135818481, "num_tokens": 669072336.0, "step": 17536 }, { "epoch": 2.230886655641776, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.98780822753906, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8709778785705566, "num_tokens": 669110303.0, "step": 17537 }, { "epoch": 2.2310138659203664, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.358673095703125, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8531767129898071, "num_tokens": 669146874.0, "step": 17538 }, { "epoch": 2.231141076198957, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.75328063964844, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8744120597839355, "num_tokens": 669181682.0, "step": 17539 }, { "epoch": 2.2312682864775475, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.41914749145508, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.86342453956604, "num_tokens": 669227201.0, "step": 17540 }, { "epoch": 2.231395496756138, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.033226013183594, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8640917539596558, "num_tokens": 669265915.0, "step": 17541 }, { "epoch": 2.2315227070347285, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.8668212890625, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8666784763336182, "num_tokens": 669304130.0, "step": 17542 }, { "epoch": 2.231649917313319, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.146141052246094, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8593342304229736, "num_tokens": 669344225.0, "step": 17543 }, { "epoch": 2.2317771275919096, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.94957733154297, "learning_rate": 1e-06, "loss": 0.65, "mean_token_accuracy": 0.8435878157615662, "num_tokens": 669385126.0, "step": 17544 }, { "epoch": 2.2319043378705, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.65517044067383, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8512004613876343, "num_tokens": 669425774.0, "step": 17545 }, { "epoch": 2.2320315481490907, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.52373504638672, "learning_rate": 1e-06, "loss": 0.6315, "mean_token_accuracy": 0.8518983125686646, "num_tokens": 669459789.0, "step": 17546 }, { "epoch": 2.2321587584276807, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.83119583129883, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.877577543258667, "num_tokens": 669498709.0, "step": 17547 }, { "epoch": 2.2322859687062713, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.80914306640625, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.86725252866745, "num_tokens": 669536021.0, "step": 17548 }, { "epoch": 2.232413178984862, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.37630844116211, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8483821153640747, "num_tokens": 669574316.0, "step": 17549 }, { "epoch": 2.2325403892634523, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.27238082885742, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8529269099235535, "num_tokens": 669611826.0, "step": 17550 }, { "epoch": 2.232667599542043, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.31282043457031, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8593456745147705, "num_tokens": 669655277.0, "step": 17551 }, { "epoch": 2.2327948098206334, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.203739166259766, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8565298914909363, "num_tokens": 669693481.0, "step": 17552 }, { "epoch": 2.232922020099224, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.734317779541016, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8610763549804688, "num_tokens": 669724104.0, "step": 17553 }, { "epoch": 2.2330492303778144, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.78450012207031, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.8450264930725098, "num_tokens": 669768725.0, "step": 17554 }, { "epoch": 2.233176440656405, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.32678985595703, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8714301586151123, "num_tokens": 669809508.0, "step": 17555 }, { "epoch": 2.2333036509349955, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.63811111450195, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8634409308433533, "num_tokens": 669848047.0, "step": 17556 }, { "epoch": 2.233430861213586, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.738529205322266, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.8788235187530518, "num_tokens": 669881274.0, "step": 17557 }, { "epoch": 2.2335580714921766, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.00788116455078, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.856269359588623, "num_tokens": 669918517.0, "step": 17558 }, { "epoch": 2.233685281770767, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.599143981933594, "learning_rate": 1e-06, "loss": 0.6329, "mean_token_accuracy": 0.8541330099105835, "num_tokens": 669960990.0, "step": 17559 }, { "epoch": 2.2338124920493576, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.2120475769043, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8652337193489075, "num_tokens": 669995542.0, "step": 17560 }, { "epoch": 2.233939702327948, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.412452697753906, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8626936078071594, "num_tokens": 670036798.0, "step": 17561 }, { "epoch": 2.2340669126065387, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.08460998535156, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8607373237609863, "num_tokens": 670071763.0, "step": 17562 }, { "epoch": 2.234194122885129, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.836524963378906, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8610705137252808, "num_tokens": 670100864.0, "step": 17563 }, { "epoch": 2.2343213331637197, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.784812927246094, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8508151769638062, "num_tokens": 670144114.0, "step": 17564 }, { "epoch": 2.2344485434423103, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.28642272949219, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8654526472091675, "num_tokens": 670182719.0, "step": 17565 }, { "epoch": 2.234575753720901, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.719947814941406, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8659708499908447, "num_tokens": 670218059.0, "step": 17566 }, { "epoch": 2.2347029639994913, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.197471618652344, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8582835793495178, "num_tokens": 670259589.0, "step": 17567 }, { "epoch": 2.234830174278082, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.15576934814453, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8532934188842773, "num_tokens": 670301755.0, "step": 17568 }, { "epoch": 2.2349573845566724, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.87824249267578, "learning_rate": 1e-06, "loss": 0.5687, "mean_token_accuracy": 0.8663946390151978, "num_tokens": 670339205.0, "step": 17569 }, { "epoch": 2.235084594835263, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.335628509521484, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8650798797607422, "num_tokens": 670382961.0, "step": 17570 }, { "epoch": 2.2352118051138534, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.88627624511719, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8517951965332031, "num_tokens": 670420747.0, "step": 17571 }, { "epoch": 2.2353390153924435, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.9585075378418, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8796593546867371, "num_tokens": 670457906.0, "step": 17572 }, { "epoch": 2.235466225671034, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.236122131347656, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.851744532585144, "num_tokens": 670493187.0, "step": 17573 }, { "epoch": 2.2355934359496246, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.07841110229492, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8761515617370605, "num_tokens": 670529750.0, "step": 17574 }, { "epoch": 2.235720646228215, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.64611053466797, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8593243360519409, "num_tokens": 670570147.0, "step": 17575 }, { "epoch": 2.2358478565068056, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.96075439453125, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8626991510391235, "num_tokens": 670604113.0, "step": 17576 }, { "epoch": 2.235975066785396, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.94538497924805, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8634864091873169, "num_tokens": 670645451.0, "step": 17577 }, { "epoch": 2.2361022770639867, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.8394889831543, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8671107292175293, "num_tokens": 670684055.0, "step": 17578 }, { "epoch": 2.236229487342577, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.3316650390625, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8541097044944763, "num_tokens": 670724397.0, "step": 17579 }, { "epoch": 2.2363566976211677, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.61777877807617, "learning_rate": 1e-06, "loss": 0.546, "mean_token_accuracy": 0.873874306678772, "num_tokens": 670761282.0, "step": 17580 }, { "epoch": 2.2364839078997583, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.55377960205078, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8660862445831299, "num_tokens": 670800007.0, "step": 17581 }, { "epoch": 2.236611118178349, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.647769927978516, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8603280782699585, "num_tokens": 670840040.0, "step": 17582 }, { "epoch": 2.2367383284569393, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.76567077636719, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8557720184326172, "num_tokens": 670881193.0, "step": 17583 }, { "epoch": 2.23686553873553, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.40003967285156, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8679900169372559, "num_tokens": 670927275.0, "step": 17584 }, { "epoch": 2.2369927490141204, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.50310134887695, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8659284114837646, "num_tokens": 670967327.0, "step": 17585 }, { "epoch": 2.237119959292711, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.79103088378906, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8668997287750244, "num_tokens": 671009250.0, "step": 17586 }, { "epoch": 2.2372471695713014, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.60609436035156, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8647556900978088, "num_tokens": 671045068.0, "step": 17587 }, { "epoch": 2.237374379849892, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.7818489074707, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8495464324951172, "num_tokens": 671084786.0, "step": 17588 }, { "epoch": 2.2375015901284825, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.08163833618164, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8538588285446167, "num_tokens": 671123235.0, "step": 17589 }, { "epoch": 2.237628800407073, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.07601547241211, "learning_rate": 1e-06, "loss": 0.5437, "mean_token_accuracy": 0.8776040077209473, "num_tokens": 671161702.0, "step": 17590 }, { "epoch": 2.2377560106856635, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.44135665893555, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.873232364654541, "num_tokens": 671195487.0, "step": 17591 }, { "epoch": 2.237883220964254, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.12618637084961, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8608344793319702, "num_tokens": 671232288.0, "step": 17592 }, { "epoch": 2.2380104312428446, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.27653884887695, "learning_rate": 1e-06, "loss": 0.6358, "mean_token_accuracy": 0.8463056087493896, "num_tokens": 671275154.0, "step": 17593 }, { "epoch": 2.238137641521435, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.754112243652344, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8520143032073975, "num_tokens": 671312266.0, "step": 17594 }, { "epoch": 2.238264851800025, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.72687911987305, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8621553778648376, "num_tokens": 671349690.0, "step": 17595 }, { "epoch": 2.238392062078616, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.39697265625, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.864548921585083, "num_tokens": 671387898.0, "step": 17596 }, { "epoch": 2.2385192723572063, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.23724365234375, "learning_rate": 1e-06, "loss": 0.6632, "mean_token_accuracy": 0.8405742645263672, "num_tokens": 671428054.0, "step": 17597 }, { "epoch": 2.238646482635797, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.89522933959961, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8681687712669373, "num_tokens": 671467096.0, "step": 17598 }, { "epoch": 2.2387736929143873, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.89605712890625, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8623143434524536, "num_tokens": 671506752.0, "step": 17599 }, { "epoch": 2.238900903192978, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.3541374206543, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8541966676712036, "num_tokens": 671542403.0, "step": 17600 }, { "epoch": 2.2390281134715684, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.74286651611328, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8724807500839233, "num_tokens": 671578115.0, "step": 17601 }, { "epoch": 2.239155323750159, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.21871566772461, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8744493126869202, "num_tokens": 671619854.0, "step": 17602 }, { "epoch": 2.2392825340287494, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.94988250732422, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.876028835773468, "num_tokens": 671661617.0, "step": 17603 }, { "epoch": 2.23940974430734, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.002559661865234, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8508849143981934, "num_tokens": 671702231.0, "step": 17604 }, { "epoch": 2.2395369545859305, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.52946853637695, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8630872368812561, "num_tokens": 671741483.0, "step": 17605 }, { "epoch": 2.239664164864521, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 48.82771301269531, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8568845987319946, "num_tokens": 671780168.0, "step": 17606 }, { "epoch": 2.2397913751431116, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.087303161621094, "learning_rate": 1e-06, "loss": 0.659, "mean_token_accuracy": 0.8385902047157288, "num_tokens": 671818980.0, "step": 17607 }, { "epoch": 2.239918585421702, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.76486587524414, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8602320551872253, "num_tokens": 671850010.0, "step": 17608 }, { "epoch": 2.2400457957002926, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.13353729248047, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8555290699005127, "num_tokens": 671888506.0, "step": 17609 }, { "epoch": 2.240173005978883, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.95914840698242, "learning_rate": 1e-06, "loss": 0.5716, "mean_token_accuracy": 0.8691821694374084, "num_tokens": 671930965.0, "step": 17610 }, { "epoch": 2.2403002162574737, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.91468811035156, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8709403872489929, "num_tokens": 671967472.0, "step": 17611 }, { "epoch": 2.240427426536064, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.228233337402344, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8843678832054138, "num_tokens": 672005986.0, "step": 17612 }, { "epoch": 2.2405546368146547, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 48.617122650146484, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8613303899765015, "num_tokens": 672044729.0, "step": 17613 }, { "epoch": 2.2406818470932452, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.88381576538086, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.862238347530365, "num_tokens": 672080141.0, "step": 17614 }, { "epoch": 2.2408090573718358, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.020591735839844, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8696622848510742, "num_tokens": 672119277.0, "step": 17615 }, { "epoch": 2.2409362676504263, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 48.115875244140625, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.860257625579834, "num_tokens": 672154205.0, "step": 17616 }, { "epoch": 2.241063477929017, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.89704513549805, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8626883029937744, "num_tokens": 672193667.0, "step": 17617 }, { "epoch": 2.2411906882076074, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.987701416015625, "learning_rate": 1e-06, "loss": 0.6912, "mean_token_accuracy": 0.8316167593002319, "num_tokens": 672232360.0, "step": 17618 }, { "epoch": 2.241317898486198, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.56847381591797, "learning_rate": 1e-06, "loss": 0.6349, "mean_token_accuracy": 0.8484606146812439, "num_tokens": 672264127.0, "step": 17619 }, { "epoch": 2.241445108764788, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.716556549072266, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8699609637260437, "num_tokens": 672300719.0, "step": 17620 }, { "epoch": 2.2415723190433785, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.089149475097656, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8721871972084045, "num_tokens": 672342796.0, "step": 17621 }, { "epoch": 2.241699529321969, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.73994064331055, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8636080622673035, "num_tokens": 672382597.0, "step": 17622 }, { "epoch": 2.2418267396005596, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.114532470703125, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8497545719146729, "num_tokens": 672427350.0, "step": 17623 }, { "epoch": 2.24195394987915, "ewc_loss": 0.1806640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015735626220703125, "grad_norm": 47.467323303222656, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8550022840499878, "num_tokens": 672468997.0, "step": 17624 }, { "epoch": 2.2420811601577406, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.07568359375, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8678831458091736, "num_tokens": 672505061.0, "step": 17625 }, { "epoch": 2.242208370436331, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.82268142700195, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8613073229789734, "num_tokens": 672544241.0, "step": 17626 }, { "epoch": 2.2423355807149217, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.052833557128906, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8613020181655884, "num_tokens": 672586334.0, "step": 17627 }, { "epoch": 2.242462790993512, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.96974182128906, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8580160737037659, "num_tokens": 672628627.0, "step": 17628 }, { "epoch": 2.2425900012721027, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.46879196166992, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8523118495941162, "num_tokens": 672663046.0, "step": 17629 }, { "epoch": 2.2427172115506933, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.952301025390625, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8687337040901184, "num_tokens": 672703394.0, "step": 17630 }, { "epoch": 2.242844421829284, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.14605712890625, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8525605201721191, "num_tokens": 672743262.0, "step": 17631 }, { "epoch": 2.2429716321078743, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.76514434814453, "learning_rate": 1e-06, "loss": 0.695, "mean_token_accuracy": 0.8264359831809998, "num_tokens": 672782662.0, "step": 17632 }, { "epoch": 2.243098842386465, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.71297836303711, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8592836856842041, "num_tokens": 672818400.0, "step": 17633 }, { "epoch": 2.2432260526650554, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.8320426940918, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.873228907585144, "num_tokens": 672858843.0, "step": 17634 }, { "epoch": 2.243353262943646, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.999637603759766, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8649857044219971, "num_tokens": 672900397.0, "step": 17635 }, { "epoch": 2.2434804732222364, "ewc_loss": 0.1796875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000156402587890625, "grad_norm": 47.91773223876953, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8603755235671997, "num_tokens": 672936025.0, "step": 17636 }, { "epoch": 2.243607683500827, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.01734924316406, "learning_rate": 1e-06, "loss": 0.6579, "mean_token_accuracy": 0.8421876430511475, "num_tokens": 672979180.0, "step": 17637 }, { "epoch": 2.2437348937794175, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.83761978149414, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8725020885467529, "num_tokens": 673018053.0, "step": 17638 }, { "epoch": 2.243862104058008, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.03867721557617, "learning_rate": 1e-06, "loss": 0.6568, "mean_token_accuracy": 0.8401274681091309, "num_tokens": 673057596.0, "step": 17639 }, { "epoch": 2.2439893143365985, "ewc_loss": 0.181640625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001583099365234375, "grad_norm": 47.85910415649414, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8720165491104126, "num_tokens": 673097740.0, "step": 17640 }, { "epoch": 2.244116524615189, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.91755294799805, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8665403127670288, "num_tokens": 673141821.0, "step": 17641 }, { "epoch": 2.2442437348937796, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.03505325317383, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8704942464828491, "num_tokens": 673184279.0, "step": 17642 }, { "epoch": 2.24437094517237, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.13698959350586, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8709839582443237, "num_tokens": 673224328.0, "step": 17643 }, { "epoch": 2.2444981554509607, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.38916778564453, "learning_rate": 1e-06, "loss": 0.642, "mean_token_accuracy": 0.8486340641975403, "num_tokens": 673263328.0, "step": 17644 }, { "epoch": 2.2446253657295507, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.29943084716797, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.866493284702301, "num_tokens": 673301663.0, "step": 17645 }, { "epoch": 2.2447525760081413, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.28422927856445, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8587406277656555, "num_tokens": 673340606.0, "step": 17646 }, { "epoch": 2.244879786286732, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.31227111816406, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8574867844581604, "num_tokens": 673382201.0, "step": 17647 }, { "epoch": 2.2450069965653223, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.00320053100586, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8711707592010498, "num_tokens": 673423330.0, "step": 17648 }, { "epoch": 2.245134206843913, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.72140884399414, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8786787986755371, "num_tokens": 673459690.0, "step": 17649 }, { "epoch": 2.2452614171225034, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.982215881347656, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8585423231124878, "num_tokens": 673494827.0, "step": 17650 }, { "epoch": 2.245388627401094, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.5321044921875, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.8506835699081421, "num_tokens": 673540568.0, "step": 17651 }, { "epoch": 2.2455158376796844, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.492088317871094, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8774688243865967, "num_tokens": 673574022.0, "step": 17652 }, { "epoch": 2.245643047958275, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.45225143432617, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8579736351966858, "num_tokens": 673612493.0, "step": 17653 }, { "epoch": 2.2457702582368655, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.88007736206055, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8586305379867554, "num_tokens": 673653729.0, "step": 17654 }, { "epoch": 2.245897468515456, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.40673828125, "learning_rate": 1e-06, "loss": 0.5556, "mean_token_accuracy": 0.8745852112770081, "num_tokens": 673695626.0, "step": 17655 }, { "epoch": 2.2460246787940465, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.041236877441406, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8666497468948364, "num_tokens": 673732188.0, "step": 17656 }, { "epoch": 2.246151889072637, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.046234130859375, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8536830544471741, "num_tokens": 673770259.0, "step": 17657 }, { "epoch": 2.2462790993512276, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.1650505065918, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8586602210998535, "num_tokens": 673805018.0, "step": 17658 }, { "epoch": 2.246406309629818, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.2726936340332, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8542004823684692, "num_tokens": 673846058.0, "step": 17659 }, { "epoch": 2.2465335199084087, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.97312545776367, "learning_rate": 1e-06, "loss": 0.633, "mean_token_accuracy": 0.8500955104827881, "num_tokens": 673886437.0, "step": 17660 }, { "epoch": 2.246660730186999, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.24723434448242, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8665074110031128, "num_tokens": 673925701.0, "step": 17661 }, { "epoch": 2.2467879404655897, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.90553283691406, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8657556772232056, "num_tokens": 673969854.0, "step": 17662 }, { "epoch": 2.2469151507441802, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.3900032043457, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8635076880455017, "num_tokens": 674005048.0, "step": 17663 }, { "epoch": 2.2470423610227708, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.160728454589844, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8568180203437805, "num_tokens": 674046548.0, "step": 17664 }, { "epoch": 2.2471695713013613, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.193702697753906, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8564376831054688, "num_tokens": 674086905.0, "step": 17665 }, { "epoch": 2.247296781579952, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.14524459838867, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8710840940475464, "num_tokens": 674124589.0, "step": 17666 }, { "epoch": 2.2474239918585424, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.89476776123047, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8746577501296997, "num_tokens": 674165135.0, "step": 17667 }, { "epoch": 2.247551202137133, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.58989334106445, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8574007749557495, "num_tokens": 674207269.0, "step": 17668 }, { "epoch": 2.2476784124157234, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.983917236328125, "learning_rate": 1e-06, "loss": 0.5324, "mean_token_accuracy": 0.8786604404449463, "num_tokens": 674240806.0, "step": 17669 }, { "epoch": 2.2478056226943135, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.09429931640625, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8525303602218628, "num_tokens": 674274697.0, "step": 17670 }, { "epoch": 2.247932832972904, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.30914306640625, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8535946011543274, "num_tokens": 674311362.0, "step": 17671 }, { "epoch": 2.2480600432514946, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.236595153808594, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8683658838272095, "num_tokens": 674352505.0, "step": 17672 }, { "epoch": 2.248187253530085, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.99739074707031, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.8668248653411865, "num_tokens": 674383938.0, "step": 17673 }, { "epoch": 2.2483144638086756, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.281795501708984, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8595702648162842, "num_tokens": 674421665.0, "step": 17674 }, { "epoch": 2.248441674087266, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.93113327026367, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8648421764373779, "num_tokens": 674466065.0, "step": 17675 }, { "epoch": 2.2485688843658567, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.39752197265625, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8637840747833252, "num_tokens": 674506868.0, "step": 17676 }, { "epoch": 2.248696094644447, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.36777114868164, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8566057682037354, "num_tokens": 674545756.0, "step": 17677 }, { "epoch": 2.2488233049230377, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.622962951660156, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8573817014694214, "num_tokens": 674590680.0, "step": 17678 }, { "epoch": 2.2489505152016283, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.77335739135742, "learning_rate": 1e-06, "loss": 0.5624, "mean_token_accuracy": 0.868297815322876, "num_tokens": 674631624.0, "step": 17679 }, { "epoch": 2.249077725480219, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.16510009765625, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8562499284744263, "num_tokens": 674667365.0, "step": 17680 }, { "epoch": 2.2492049357588093, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.5937385559082, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8616102933883667, "num_tokens": 674706587.0, "step": 17681 }, { "epoch": 2.2493321460374, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.627037048339844, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8603234887123108, "num_tokens": 674744480.0, "step": 17682 }, { "epoch": 2.2494593563159904, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.96955108642578, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8631887435913086, "num_tokens": 674783104.0, "step": 17683 }, { "epoch": 2.249586566594581, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.96977233886719, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8531689643859863, "num_tokens": 674823488.0, "step": 17684 }, { "epoch": 2.2497137768731714, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.743106842041016, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8666110038757324, "num_tokens": 674857322.0, "step": 17685 }, { "epoch": 2.249840987151762, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 48.08418273925781, "learning_rate": 1e-06, "loss": 0.6184, "mean_token_accuracy": 0.8530908823013306, "num_tokens": 674901566.0, "step": 17686 }, { "epoch": 2.2499681974303525, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.45868682861328, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8672276735305786, "num_tokens": 674942803.0, "step": 17687 }, { "epoch": 2.250095407708943, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.77495574951172, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8519083261489868, "num_tokens": 674983160.0, "step": 17688 }, { "epoch": 2.2502226179875335, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.77243423461914, "learning_rate": 1e-06, "loss": 0.59, "mean_token_accuracy": 0.8623908758163452, "num_tokens": 675021243.0, "step": 17689 }, { "epoch": 2.250349828266124, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.98905563354492, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8582836389541626, "num_tokens": 675059435.0, "step": 17690 }, { "epoch": 2.2504770385447146, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.38918685913086, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8664806485176086, "num_tokens": 675091689.0, "step": 17691 }, { "epoch": 2.250604248823305, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.01240539550781, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8565400838851929, "num_tokens": 675130368.0, "step": 17692 }, { "epoch": 2.250731459101895, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.39944076538086, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8651800155639648, "num_tokens": 675171745.0, "step": 17693 }, { "epoch": 2.250858669380486, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.15909957885742, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8596605062484741, "num_tokens": 675206052.0, "step": 17694 }, { "epoch": 2.2509858796590763, "ewc_loss": 0.1826171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00015926361083984375, "grad_norm": 47.832481384277344, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8553861975669861, "num_tokens": 675243231.0, "step": 17695 }, { "epoch": 2.251113089937667, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.20100402832031, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8551273345947266, "num_tokens": 675273582.0, "step": 17696 }, { "epoch": 2.2512403002162573, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.540130615234375, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8593833446502686, "num_tokens": 675314247.0, "step": 17697 }, { "epoch": 2.251367510494848, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.622344970703125, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.863315999507904, "num_tokens": 675355675.0, "step": 17698 }, { "epoch": 2.2514947207734384, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.29259490966797, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8586679697036743, "num_tokens": 675394290.0, "step": 17699 }, { "epoch": 2.251621931052029, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.483097076416016, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.865037202835083, "num_tokens": 675434251.0, "step": 17700 }, { "epoch": 2.2517491413306194, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.19070053100586, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8671613931655884, "num_tokens": 675474731.0, "step": 17701 }, { "epoch": 2.25187635160921, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.03278350830078, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8565767407417297, "num_tokens": 675515180.0, "step": 17702 }, { "epoch": 2.2520035618878005, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.655887603759766, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.850202202796936, "num_tokens": 675553429.0, "step": 17703 }, { "epoch": 2.252130772166391, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.08279800415039, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8654912114143372, "num_tokens": 675592919.0, "step": 17704 }, { "epoch": 2.2522579824449815, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.49928665161133, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8705920577049255, "num_tokens": 675627528.0, "step": 17705 }, { "epoch": 2.252385192723572, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.2460823059082, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8716246485710144, "num_tokens": 675663721.0, "step": 17706 }, { "epoch": 2.2525124030021626, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.138912200927734, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8553283214569092, "num_tokens": 675705187.0, "step": 17707 }, { "epoch": 2.252639613280753, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.2893180847168, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8724801540374756, "num_tokens": 675752713.0, "step": 17708 }, { "epoch": 2.2527668235593437, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.10331726074219, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8621097803115845, "num_tokens": 675791009.0, "step": 17709 }, { "epoch": 2.252894033837934, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.48099899291992, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8645192384719849, "num_tokens": 675828890.0, "step": 17710 }, { "epoch": 2.2530212441165247, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.88903045654297, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8631143569946289, "num_tokens": 675862855.0, "step": 17711 }, { "epoch": 2.2531484543951152, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.49540710449219, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8608700633049011, "num_tokens": 675903032.0, "step": 17712 }, { "epoch": 2.2532756646737058, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.67179489135742, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8566350936889648, "num_tokens": 675943035.0, "step": 17713 }, { "epoch": 2.2534028749522963, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.51325988769531, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8640881776809692, "num_tokens": 675983073.0, "step": 17714 }, { "epoch": 2.253530085230887, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.96631622314453, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8672020435333252, "num_tokens": 676022341.0, "step": 17715 }, { "epoch": 2.2536572955094774, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.40736389160156, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8583865165710449, "num_tokens": 676060907.0, "step": 17716 }, { "epoch": 2.253784505788068, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.73140335083008, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8589444160461426, "num_tokens": 676100693.0, "step": 17717 }, { "epoch": 2.253911716066658, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.21783447265625, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8771349191665649, "num_tokens": 676137526.0, "step": 17718 }, { "epoch": 2.254038926345249, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.04240036010742, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8641918897628784, "num_tokens": 676177245.0, "step": 17719 }, { "epoch": 2.254166136623839, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.059913635253906, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8688738346099854, "num_tokens": 676214568.0, "step": 17720 }, { "epoch": 2.2542933469024296, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.97758865356445, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8624374866485596, "num_tokens": 676252552.0, "step": 17721 }, { "epoch": 2.25442055718102, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.89140319824219, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.843897819519043, "num_tokens": 676290480.0, "step": 17722 }, { "epoch": 2.2545477674596106, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.19607925415039, "learning_rate": 1e-06, "loss": 0.5516, "mean_token_accuracy": 0.875086784362793, "num_tokens": 676324950.0, "step": 17723 }, { "epoch": 2.254674977738201, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.1685791015625, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8550727367401123, "num_tokens": 676363851.0, "step": 17724 }, { "epoch": 2.2548021880167917, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.25436782836914, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8703618049621582, "num_tokens": 676399920.0, "step": 17725 }, { "epoch": 2.254929398295382, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.91062927246094, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8593183755874634, "num_tokens": 676443243.0, "step": 17726 }, { "epoch": 2.2550566085739727, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.68216323852539, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8481985330581665, "num_tokens": 676480272.0, "step": 17727 }, { "epoch": 2.2551838188525632, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.615055084228516, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8584698438644409, "num_tokens": 676516630.0, "step": 17728 }, { "epoch": 2.2553110291311538, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.87554931640625, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8667865991592407, "num_tokens": 676557842.0, "step": 17729 }, { "epoch": 2.2554382394097443, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.63983154296875, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8718500733375549, "num_tokens": 676599413.0, "step": 17730 }, { "epoch": 2.255565449688335, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.65786361694336, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8534789085388184, "num_tokens": 676637791.0, "step": 17731 }, { "epoch": 2.2556926599669254, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.61752700805664, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8565706014633179, "num_tokens": 676677108.0, "step": 17732 }, { "epoch": 2.255819870245516, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.4565544128418, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8659120202064514, "num_tokens": 676720698.0, "step": 17733 }, { "epoch": 2.2559470805241064, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.16739273071289, "learning_rate": 1e-06, "loss": 0.626, "mean_token_accuracy": 0.8492337465286255, "num_tokens": 676763719.0, "step": 17734 }, { "epoch": 2.256074290802697, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.981327056884766, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8621386289596558, "num_tokens": 676801083.0, "step": 17735 }, { "epoch": 2.2562015010812875, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 48.14674758911133, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8536110520362854, "num_tokens": 676840861.0, "step": 17736 }, { "epoch": 2.256328711359878, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.394161224365234, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8746622800827026, "num_tokens": 676876864.0, "step": 17737 }, { "epoch": 2.2564559216384685, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.124691009521484, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8813750743865967, "num_tokens": 676920983.0, "step": 17738 }, { "epoch": 2.256583131917059, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.5581169128418, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.871122419834137, "num_tokens": 676961294.0, "step": 17739 }, { "epoch": 2.2567103421956496, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.674468994140625, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8549879789352417, "num_tokens": 676996318.0, "step": 17740 }, { "epoch": 2.2568375524742397, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.62472915649414, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8639997839927673, "num_tokens": 677034083.0, "step": 17741 }, { "epoch": 2.2569647627528306, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.39180374145508, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8634114265441895, "num_tokens": 677071206.0, "step": 17742 }, { "epoch": 2.2570919730314207, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.565303802490234, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8668103218078613, "num_tokens": 677103573.0, "step": 17743 }, { "epoch": 2.2572191833100113, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.29630661010742, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.872192919254303, "num_tokens": 677141406.0, "step": 17744 }, { "epoch": 2.257346393588602, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.825355529785156, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8523827195167542, "num_tokens": 677185968.0, "step": 17745 }, { "epoch": 2.2574736038671923, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.12165069580078, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8534704446792603, "num_tokens": 677229004.0, "step": 17746 }, { "epoch": 2.257600814145783, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.78011703491211, "learning_rate": 1e-06, "loss": 0.5419, "mean_token_accuracy": 0.8728755116462708, "num_tokens": 677266317.0, "step": 17747 }, { "epoch": 2.2577280244243734, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.853214263916016, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8713542222976685, "num_tokens": 677302344.0, "step": 17748 }, { "epoch": 2.257855234702964, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.660640716552734, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8539954423904419, "num_tokens": 677342103.0, "step": 17749 }, { "epoch": 2.2579824449815544, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.06694412231445, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8481965661048889, "num_tokens": 677378602.0, "step": 17750 }, { "epoch": 2.258109655260145, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.296146392822266, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8600708842277527, "num_tokens": 677415005.0, "step": 17751 }, { "epoch": 2.2582368655387355, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.15535354614258, "learning_rate": 1e-06, "loss": 0.6641, "mean_token_accuracy": 0.8441811203956604, "num_tokens": 677457908.0, "step": 17752 }, { "epoch": 2.258364075817326, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.90976333618164, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8508681654930115, "num_tokens": 677492759.0, "step": 17753 }, { "epoch": 2.2584912860959165, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.062557220458984, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8594319820404053, "num_tokens": 677535771.0, "step": 17754 }, { "epoch": 2.258618496374507, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.12791442871094, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8751706480979919, "num_tokens": 677575325.0, "step": 17755 }, { "epoch": 2.2587457066530976, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.09502410888672, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8623764514923096, "num_tokens": 677614132.0, "step": 17756 }, { "epoch": 2.258872916931688, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.483646392822266, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8550688028335571, "num_tokens": 677645657.0, "step": 17757 }, { "epoch": 2.2590001272102787, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.915260314941406, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8703365325927734, "num_tokens": 677686238.0, "step": 17758 }, { "epoch": 2.259127337488869, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.02244186401367, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8647515773773193, "num_tokens": 677724080.0, "step": 17759 }, { "epoch": 2.2592545477674597, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.289794921875, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8618343472480774, "num_tokens": 677765768.0, "step": 17760 }, { "epoch": 2.2593817580460502, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.13607406616211, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.86295485496521, "num_tokens": 677803937.0, "step": 17761 }, { "epoch": 2.2595089683246408, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.1881217956543, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8741958141326904, "num_tokens": 677839685.0, "step": 17762 }, { "epoch": 2.2596361786032313, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.14463806152344, "learning_rate": 1e-06, "loss": 0.6571, "mean_token_accuracy": 0.8488230109214783, "num_tokens": 677880411.0, "step": 17763 }, { "epoch": 2.259763388881822, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.12539291381836, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8707277178764343, "num_tokens": 677921177.0, "step": 17764 }, { "epoch": 2.2598905991604123, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.94906234741211, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8713580965995789, "num_tokens": 677956393.0, "step": 17765 }, { "epoch": 2.2600178094390024, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.943115234375, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8614561557769775, "num_tokens": 677988364.0, "step": 17766 }, { "epoch": 2.2601450197175934, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.82246780395508, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8686150312423706, "num_tokens": 678028093.0, "step": 17767 }, { "epoch": 2.2602722299961835, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.31929397583008, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.861998975276947, "num_tokens": 678068174.0, "step": 17768 }, { "epoch": 2.260399440274774, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.38060760498047, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8655263185501099, "num_tokens": 678105784.0, "step": 17769 }, { "epoch": 2.2605266505533645, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.50495910644531, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.873224139213562, "num_tokens": 678142873.0, "step": 17770 }, { "epoch": 2.260653860831955, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.2803840637207, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8583315014839172, "num_tokens": 678181527.0, "step": 17771 }, { "epoch": 2.2607810711105456, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.226951599121094, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8643784523010254, "num_tokens": 678221225.0, "step": 17772 }, { "epoch": 2.260908281389136, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.43962860107422, "learning_rate": 1e-06, "loss": 0.5546, "mean_token_accuracy": 0.8710616827011108, "num_tokens": 678254767.0, "step": 17773 }, { "epoch": 2.2610354916677267, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.203189849853516, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.864587664604187, "num_tokens": 678290830.0, "step": 17774 }, { "epoch": 2.261162701946317, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.788330078125, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8710500597953796, "num_tokens": 678327102.0, "step": 17775 }, { "epoch": 2.2612899122249077, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.063533782958984, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.8764424324035645, "num_tokens": 678364872.0, "step": 17776 }, { "epoch": 2.2614171225034982, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.011863708496094, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8545992374420166, "num_tokens": 678399233.0, "step": 17777 }, { "epoch": 2.2615443327820888, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.7304573059082, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8712841868400574, "num_tokens": 678434577.0, "step": 17778 }, { "epoch": 2.2616715430606793, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.598087310791016, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8670713305473328, "num_tokens": 678466835.0, "step": 17779 }, { "epoch": 2.26179875333927, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.16093063354492, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8781057596206665, "num_tokens": 678506817.0, "step": 17780 }, { "epoch": 2.2619259636178604, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.37594223022461, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8637871146202087, "num_tokens": 678543734.0, "step": 17781 }, { "epoch": 2.262053173896451, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.50303649902344, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8584129810333252, "num_tokens": 678585066.0, "step": 17782 }, { "epoch": 2.2621803841750414, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.257537841796875, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.8779441118240356, "num_tokens": 678614951.0, "step": 17783 }, { "epoch": 2.262307594453632, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.284400939941406, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8653651475906372, "num_tokens": 678649671.0, "step": 17784 }, { "epoch": 2.2624348047322225, "ewc_loss": 0.1875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.76717758178711, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8803092241287231, "num_tokens": 678693418.0, "step": 17785 }, { "epoch": 2.262562015010813, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.63993835449219, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8733952045440674, "num_tokens": 678730891.0, "step": 17786 }, { "epoch": 2.2626892252894035, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.816585540771484, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.870226263999939, "num_tokens": 678766369.0, "step": 17787 }, { "epoch": 2.262816435567994, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.53866958618164, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8528549671173096, "num_tokens": 678809118.0, "step": 17788 }, { "epoch": 2.2629436458465846, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.76874923706055, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8611868023872375, "num_tokens": 678850057.0, "step": 17789 }, { "epoch": 2.263070856125175, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.17332077026367, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8639890551567078, "num_tokens": 678890253.0, "step": 17790 }, { "epoch": 2.263198066403765, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.951560974121094, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.877737283706665, "num_tokens": 678922535.0, "step": 17791 }, { "epoch": 2.263325276682356, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.21089553833008, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8612151145935059, "num_tokens": 678956136.0, "step": 17792 }, { "epoch": 2.2634524869609463, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.964195251464844, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8653208613395691, "num_tokens": 678990864.0, "step": 17793 }, { "epoch": 2.263579697239537, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.09241485595703, "learning_rate": 1e-06, "loss": 0.5491, "mean_token_accuracy": 0.875571608543396, "num_tokens": 679028345.0, "step": 17794 }, { "epoch": 2.2637069075181273, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.09432601928711, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8497665524482727, "num_tokens": 679073679.0, "step": 17795 }, { "epoch": 2.263834117796718, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.713409423828125, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8621335029602051, "num_tokens": 679109274.0, "step": 17796 }, { "epoch": 2.2639613280753084, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.1131591796875, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8648127913475037, "num_tokens": 679141644.0, "step": 17797 }, { "epoch": 2.264088538353899, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.83002853393555, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8464905619621277, "num_tokens": 679184351.0, "step": 17798 }, { "epoch": 2.2642157486324894, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.326087951660156, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8597564697265625, "num_tokens": 679222509.0, "step": 17799 }, { "epoch": 2.26434295891108, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.144596099853516, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8472121357917786, "num_tokens": 679263362.0, "step": 17800 }, { "epoch": 2.2644701691896705, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.336490631103516, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8499222993850708, "num_tokens": 679307859.0, "step": 17801 }, { "epoch": 2.264597379468261, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.01650619506836, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.872652530670166, "num_tokens": 679344801.0, "step": 17802 }, { "epoch": 2.2647245897468515, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.965301513671875, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8554102182388306, "num_tokens": 679382795.0, "step": 17803 }, { "epoch": 2.264851800025442, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.167945861816406, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8606261014938354, "num_tokens": 679425282.0, "step": 17804 }, { "epoch": 2.2649790103040326, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.330345153808594, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8500818014144897, "num_tokens": 679466881.0, "step": 17805 }, { "epoch": 2.265106220582623, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.88149642944336, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.876111626625061, "num_tokens": 679509268.0, "step": 17806 }, { "epoch": 2.2652334308612136, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.36399841308594, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.859927237033844, "num_tokens": 679545082.0, "step": 17807 }, { "epoch": 2.265360641139804, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.87989807128906, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8636840581893921, "num_tokens": 679581795.0, "step": 17808 }, { "epoch": 2.2654878514183947, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.1457405090332, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8598971366882324, "num_tokens": 679618978.0, "step": 17809 }, { "epoch": 2.2656150616969852, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.82932662963867, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8662731647491455, "num_tokens": 679655002.0, "step": 17810 }, { "epoch": 2.2657422719755758, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.05038070678711, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8773306608200073, "num_tokens": 679691143.0, "step": 17811 }, { "epoch": 2.2658694822541663, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.54765319824219, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8644796013832092, "num_tokens": 679729168.0, "step": 17812 }, { "epoch": 2.265996692532757, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.788970947265625, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8622583746910095, "num_tokens": 679773385.0, "step": 17813 }, { "epoch": 2.2661239028113473, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.472904205322266, "learning_rate": 1e-06, "loss": 0.549, "mean_token_accuracy": 0.8751504421234131, "num_tokens": 679813884.0, "step": 17814 }, { "epoch": 2.266251113089938, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.378299713134766, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8644227385520935, "num_tokens": 679850834.0, "step": 17815 }, { "epoch": 2.266378323368528, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.36077880859375, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8682379126548767, "num_tokens": 679890210.0, "step": 17816 }, { "epoch": 2.266505533647119, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.499114990234375, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8731479644775391, "num_tokens": 679926252.0, "step": 17817 }, { "epoch": 2.266632743925709, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.99880599975586, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8589802980422974, "num_tokens": 679959626.0, "step": 17818 }, { "epoch": 2.2667599542042995, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.75758361816406, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8589935898780823, "num_tokens": 679999973.0, "step": 17819 }, { "epoch": 2.26688716448289, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.86474609375, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8560727834701538, "num_tokens": 680041081.0, "step": 17820 }, { "epoch": 2.2670143747614806, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.748409271240234, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8668117523193359, "num_tokens": 680078953.0, "step": 17821 }, { "epoch": 2.267141585040071, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.08247375488281, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8720811605453491, "num_tokens": 680112309.0, "step": 17822 }, { "epoch": 2.2672687953186617, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.083656311035156, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8584170937538147, "num_tokens": 680148607.0, "step": 17823 }, { "epoch": 2.267396005597252, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.881717681884766, "learning_rate": 1e-06, "loss": 0.5386, "mean_token_accuracy": 0.8799214959144592, "num_tokens": 680189617.0, "step": 17824 }, { "epoch": 2.2675232158758427, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.316261291503906, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8814449310302734, "num_tokens": 680227940.0, "step": 17825 }, { "epoch": 2.2676504261544332, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.946739196777344, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8764580488204956, "num_tokens": 680263473.0, "step": 17826 }, { "epoch": 2.2677776364330238, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.31182098388672, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.854464054107666, "num_tokens": 680297895.0, "step": 17827 }, { "epoch": 2.2679048467116143, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.408103942871094, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8641117811203003, "num_tokens": 680337658.0, "step": 17828 }, { "epoch": 2.268032056990205, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.82514572143555, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8596558570861816, "num_tokens": 680375912.0, "step": 17829 }, { "epoch": 2.2681592672687954, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.86096954345703, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8750612735748291, "num_tokens": 680415231.0, "step": 17830 }, { "epoch": 2.268286477547386, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.81045150756836, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8588913083076477, "num_tokens": 680460037.0, "step": 17831 }, { "epoch": 2.2684136878259764, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.62080764770508, "learning_rate": 1e-06, "loss": 0.6599, "mean_token_accuracy": 0.8405284881591797, "num_tokens": 680505521.0, "step": 17832 }, { "epoch": 2.268540898104567, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.56010818481445, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8763147592544556, "num_tokens": 680547682.0, "step": 17833 }, { "epoch": 2.2686681083831575, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.84185791015625, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8683301210403442, "num_tokens": 680587510.0, "step": 17834 }, { "epoch": 2.268795318661748, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.370487213134766, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8667076826095581, "num_tokens": 680624233.0, "step": 17835 }, { "epoch": 2.2689225289403385, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.04987716674805, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8602808713912964, "num_tokens": 680666091.0, "step": 17836 }, { "epoch": 2.269049739218929, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.24982452392578, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8704392313957214, "num_tokens": 680703062.0, "step": 17837 }, { "epoch": 2.2691769494975196, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.90972900390625, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8769811391830444, "num_tokens": 680735966.0, "step": 17838 }, { "epoch": 2.2693041597761097, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.03567886352539, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8635987639427185, "num_tokens": 680780010.0, "step": 17839 }, { "epoch": 2.2694313700547006, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.259246826171875, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8639754056930542, "num_tokens": 680819516.0, "step": 17840 }, { "epoch": 2.2695585803332907, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.80394744873047, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8622927665710449, "num_tokens": 680861374.0, "step": 17841 }, { "epoch": 2.2696857906118812, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.58755111694336, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.855541467666626, "num_tokens": 680902447.0, "step": 17842 }, { "epoch": 2.2698130008904718, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.75550842285156, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8813678622245789, "num_tokens": 680942376.0, "step": 17843 }, { "epoch": 2.2699402111690623, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.80181121826172, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8635258078575134, "num_tokens": 680977060.0, "step": 17844 }, { "epoch": 2.270067421447653, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.77125930786133, "learning_rate": 1e-06, "loss": 0.6437, "mean_token_accuracy": 0.8472803235054016, "num_tokens": 681014504.0, "step": 17845 }, { "epoch": 2.2701946317262434, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.1781120300293, "learning_rate": 1e-06, "loss": 0.6424, "mean_token_accuracy": 0.8470525741577148, "num_tokens": 681053919.0, "step": 17846 }, { "epoch": 2.270321842004834, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.10157775878906, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8647744059562683, "num_tokens": 681090883.0, "step": 17847 }, { "epoch": 2.2704490522834244, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.178897857666016, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8797944784164429, "num_tokens": 681128819.0, "step": 17848 }, { "epoch": 2.270576262562015, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.00168228149414, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8536385893821716, "num_tokens": 681171434.0, "step": 17849 }, { "epoch": 2.2707034728406055, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.57375717163086, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8734787702560425, "num_tokens": 681206297.0, "step": 17850 }, { "epoch": 2.270830683119196, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.84359359741211, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8531754612922668, "num_tokens": 681242521.0, "step": 17851 }, { "epoch": 2.2709578933977865, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.417301177978516, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8729366660118103, "num_tokens": 681282474.0, "step": 17852 }, { "epoch": 2.271085103676377, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 47.940269470214844, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8597333431243896, "num_tokens": 681317280.0, "step": 17853 }, { "epoch": 2.2712123139549676, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.35239028930664, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8615837097167969, "num_tokens": 681349155.0, "step": 17854 }, { "epoch": 2.271339524233558, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.17521667480469, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8633021116256714, "num_tokens": 681380863.0, "step": 17855 }, { "epoch": 2.2714667345121486, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.191619873046875, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8676105737686157, "num_tokens": 681421970.0, "step": 17856 }, { "epoch": 2.271593944790739, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.99148178100586, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8772964477539062, "num_tokens": 681453511.0, "step": 17857 }, { "epoch": 2.2717211550693297, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.69804763793945, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8512248992919922, "num_tokens": 681496293.0, "step": 17858 }, { "epoch": 2.2718483653479202, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.00974655151367, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8557323217391968, "num_tokens": 681537074.0, "step": 17859 }, { "epoch": 2.2719755756265108, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.9964599609375, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8563327789306641, "num_tokens": 681578911.0, "step": 17860 }, { "epoch": 2.2721027859051013, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.54994201660156, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8614029884338379, "num_tokens": 681622134.0, "step": 17861 }, { "epoch": 2.272229996183692, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.977909088134766, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8645445108413696, "num_tokens": 681655919.0, "step": 17862 }, { "epoch": 2.2723572064622823, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.769630432128906, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8539090752601624, "num_tokens": 681696539.0, "step": 17863 }, { "epoch": 2.2724844167408724, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.9354133605957, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8610559701919556, "num_tokens": 681734630.0, "step": 17864 }, { "epoch": 2.2726116270194634, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.52482986450195, "learning_rate": 1e-06, "loss": 0.6492, "mean_token_accuracy": 0.8517982363700867, "num_tokens": 681779669.0, "step": 17865 }, { "epoch": 2.2727388372980535, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.87209701538086, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8613734841346741, "num_tokens": 681821920.0, "step": 17866 }, { "epoch": 2.272866047576644, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.078773498535156, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.86054527759552, "num_tokens": 681859239.0, "step": 17867 }, { "epoch": 2.2729932578552345, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.135292053222656, "learning_rate": 1e-06, "loss": 0.6349, "mean_token_accuracy": 0.8493145704269409, "num_tokens": 681896569.0, "step": 17868 }, { "epoch": 2.273120468133825, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.07802200317383, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8786181807518005, "num_tokens": 681932549.0, "step": 17869 }, { "epoch": 2.2732476784124156, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.743804931640625, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8587163686752319, "num_tokens": 681973981.0, "step": 17870 }, { "epoch": 2.273374888691006, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.90217971801758, "learning_rate": 1e-06, "loss": 0.6841, "mean_token_accuracy": 0.8398237228393555, "num_tokens": 682009512.0, "step": 17871 }, { "epoch": 2.2735020989695967, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.59381103515625, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8591713905334473, "num_tokens": 682048058.0, "step": 17872 }, { "epoch": 2.273629309248187, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.348182678222656, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8561466932296753, "num_tokens": 682084181.0, "step": 17873 }, { "epoch": 2.2737565195267777, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.08734893798828, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8845505714416504, "num_tokens": 682119613.0, "step": 17874 }, { "epoch": 2.2738837298053682, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.566429138183594, "learning_rate": 1e-06, "loss": 0.5729, "mean_token_accuracy": 0.8666171431541443, "num_tokens": 682159501.0, "step": 17875 }, { "epoch": 2.2740109400839588, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.045528411865234, "learning_rate": 1e-06, "loss": 0.5582, "mean_token_accuracy": 0.8722843527793884, "num_tokens": 682197323.0, "step": 17876 }, { "epoch": 2.2741381503625493, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.682010650634766, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8648310899734497, "num_tokens": 682231150.0, "step": 17877 }, { "epoch": 2.27426536064114, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.46375274658203, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8640186786651611, "num_tokens": 682269351.0, "step": 17878 }, { "epoch": 2.2743925709197303, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.88239288330078, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8725200891494751, "num_tokens": 682303633.0, "step": 17879 }, { "epoch": 2.274519781198321, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.85275650024414, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8627626299858093, "num_tokens": 682340890.0, "step": 17880 }, { "epoch": 2.2746469914769114, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.522491455078125, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8792356252670288, "num_tokens": 682374253.0, "step": 17881 }, { "epoch": 2.274774201755502, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.11183547973633, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8629633188247681, "num_tokens": 682412675.0, "step": 17882 }, { "epoch": 2.2749014120340925, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.296783447265625, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.873355507850647, "num_tokens": 682447864.0, "step": 17883 }, { "epoch": 2.275028622312683, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.23957061767578, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8645820617675781, "num_tokens": 682486445.0, "step": 17884 }, { "epoch": 2.2751558325912735, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.96662139892578, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.862505316734314, "num_tokens": 682518173.0, "step": 17885 }, { "epoch": 2.275283042869864, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.17826461791992, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8626790046691895, "num_tokens": 682558627.0, "step": 17886 }, { "epoch": 2.2754102531484546, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.842613220214844, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8691303730010986, "num_tokens": 682600821.0, "step": 17887 }, { "epoch": 2.275537463427045, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.381309509277344, "learning_rate": 1e-06, "loss": 0.6272, "mean_token_accuracy": 0.8528212904930115, "num_tokens": 682643947.0, "step": 17888 }, { "epoch": 2.275664673705635, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.01396942138672, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8571045994758606, "num_tokens": 682683045.0, "step": 17889 }, { "epoch": 2.275791883984226, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.053646087646484, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8629085421562195, "num_tokens": 682721617.0, "step": 17890 }, { "epoch": 2.2759190942628162, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.04802322387695, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8556139469146729, "num_tokens": 682758339.0, "step": 17891 }, { "epoch": 2.2760463045414068, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.99642562866211, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8695898652076721, "num_tokens": 682800501.0, "step": 17892 }, { "epoch": 2.2761735148199973, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.23489761352539, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8621959090232849, "num_tokens": 682833231.0, "step": 17893 }, { "epoch": 2.276300725098588, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.70816421508789, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8688744306564331, "num_tokens": 682873945.0, "step": 17894 }, { "epoch": 2.2764279353771784, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.43121337890625, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8618252277374268, "num_tokens": 682918329.0, "step": 17895 }, { "epoch": 2.276555145655769, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.93079376220703, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8705092668533325, "num_tokens": 682956277.0, "step": 17896 }, { "epoch": 2.2766823559343594, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.04972457885742, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8654178380966187, "num_tokens": 682990648.0, "step": 17897 }, { "epoch": 2.27680956621295, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.005374908447266, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8762069940567017, "num_tokens": 683029046.0, "step": 17898 }, { "epoch": 2.2769367764915405, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.879764556884766, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8632354140281677, "num_tokens": 683064541.0, "step": 17899 }, { "epoch": 2.277063986770131, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.257720947265625, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8625810146331787, "num_tokens": 683105015.0, "step": 17900 }, { "epoch": 2.2771911970487215, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.09775924682617, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8567726612091064, "num_tokens": 683142938.0, "step": 17901 }, { "epoch": 2.277318407327312, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.16462326049805, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8776841163635254, "num_tokens": 683181593.0, "step": 17902 }, { "epoch": 2.2774456176059026, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.25861740112305, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.8521913886070251, "num_tokens": 683218641.0, "step": 17903 }, { "epoch": 2.277572827884493, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.730533599853516, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8471306562423706, "num_tokens": 683261479.0, "step": 17904 }, { "epoch": 2.2777000381630836, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.556060791015625, "learning_rate": 1e-06, "loss": 0.6685, "mean_token_accuracy": 0.838773250579834, "num_tokens": 683300602.0, "step": 17905 }, { "epoch": 2.277827248441674, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.50338363647461, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8617233037948608, "num_tokens": 683337012.0, "step": 17906 }, { "epoch": 2.2779544587202647, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.52054214477539, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.870836615562439, "num_tokens": 683368949.0, "step": 17907 }, { "epoch": 2.2780816689988552, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.86936950683594, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8611364364624023, "num_tokens": 683408741.0, "step": 17908 }, { "epoch": 2.2782088792774458, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.477272033691406, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8538686633110046, "num_tokens": 683447843.0, "step": 17909 }, { "epoch": 2.2783360895560363, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.220035552978516, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8608580231666565, "num_tokens": 683489695.0, "step": 17910 }, { "epoch": 2.278463299834627, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.278526306152344, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8609598278999329, "num_tokens": 683528381.0, "step": 17911 }, { "epoch": 2.2785905101132173, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.044681549072266, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8655770421028137, "num_tokens": 683567261.0, "step": 17912 }, { "epoch": 2.278717720391808, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.773956298828125, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.846967339515686, "num_tokens": 683601083.0, "step": 17913 }, { "epoch": 2.278844930670398, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.40596389770508, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8569594621658325, "num_tokens": 683643494.0, "step": 17914 }, { "epoch": 2.278972140948989, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.73401641845703, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8592509031295776, "num_tokens": 683684267.0, "step": 17915 }, { "epoch": 2.279099351227579, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.354312896728516, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8519900441169739, "num_tokens": 683723292.0, "step": 17916 }, { "epoch": 2.2792265615061695, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.219051361083984, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8745688796043396, "num_tokens": 683761378.0, "step": 17917 }, { "epoch": 2.27935377178476, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.46070861816406, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.854012668132782, "num_tokens": 683794838.0, "step": 17918 }, { "epoch": 2.2794809820633506, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.277164459228516, "learning_rate": 1e-06, "loss": 0.6546, "mean_token_accuracy": 0.8452143669128418, "num_tokens": 683832307.0, "step": 17919 }, { "epoch": 2.279608192341941, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.08375549316406, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8549165725708008, "num_tokens": 683871848.0, "step": 17920 }, { "epoch": 2.2797354026205316, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.48019790649414, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8658813238143921, "num_tokens": 683915086.0, "step": 17921 }, { "epoch": 2.279862612899122, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.97443389892578, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8547555804252625, "num_tokens": 683945411.0, "step": 17922 }, { "epoch": 2.2799898231777127, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.560386657714844, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8626381754875183, "num_tokens": 683979664.0, "step": 17923 }, { "epoch": 2.2801170334563032, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.035037994384766, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8529877662658691, "num_tokens": 684025526.0, "step": 17924 }, { "epoch": 2.2802442437348938, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.676666259765625, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8557908535003662, "num_tokens": 684056992.0, "step": 17925 }, { "epoch": 2.2803714540134843, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.673423767089844, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8663782477378845, "num_tokens": 684094515.0, "step": 17926 }, { "epoch": 2.280498664292075, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.22690963745117, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8702040910720825, "num_tokens": 684132294.0, "step": 17927 }, { "epoch": 2.2806258745706653, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.28512954711914, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8658609986305237, "num_tokens": 684164010.0, "step": 17928 }, { "epoch": 2.280753084849256, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.3016471862793, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8719239234924316, "num_tokens": 684202077.0, "step": 17929 }, { "epoch": 2.2808802951278464, "ewc_loss": 0.18359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016021728515625, "grad_norm": 47.84484100341797, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8365388512611389, "num_tokens": 684240799.0, "step": 17930 }, { "epoch": 2.281007505406437, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.86172866821289, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8600186109542847, "num_tokens": 684284080.0, "step": 17931 }, { "epoch": 2.2811347156850275, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.24506378173828, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8453185558319092, "num_tokens": 684317915.0, "step": 17932 }, { "epoch": 2.281261925963618, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.26420211791992, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.9009027481079102, "num_tokens": 684354665.0, "step": 17933 }, { "epoch": 2.2813891362422085, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.70059585571289, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8617926836013794, "num_tokens": 684392866.0, "step": 17934 }, { "epoch": 2.281516346520799, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.46812057495117, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8662352561950684, "num_tokens": 684428422.0, "step": 17935 }, { "epoch": 2.2816435567993896, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.82085418701172, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.8495856523513794, "num_tokens": 684467129.0, "step": 17936 }, { "epoch": 2.2817707670779797, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.756141662597656, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.8553009629249573, "num_tokens": 684500093.0, "step": 17937 }, { "epoch": 2.2818979773565706, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.55805587768555, "learning_rate": 1e-06, "loss": 0.6328, "mean_token_accuracy": 0.8530591726303101, "num_tokens": 684541422.0, "step": 17938 }, { "epoch": 2.2820251876351607, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.39447784423828, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8568670749664307, "num_tokens": 684579164.0, "step": 17939 }, { "epoch": 2.2821523979137512, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.83686065673828, "learning_rate": 1e-06, "loss": 0.6362, "mean_token_accuracy": 0.8466225862503052, "num_tokens": 684618617.0, "step": 17940 }, { "epoch": 2.2822796081923418, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.328548431396484, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8472298383712769, "num_tokens": 684655525.0, "step": 17941 }, { "epoch": 2.2824068184709323, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.713375091552734, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8609859943389893, "num_tokens": 684694525.0, "step": 17942 }, { "epoch": 2.282534028749523, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.27192687988281, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8612851500511169, "num_tokens": 684733247.0, "step": 17943 }, { "epoch": 2.2826612390281134, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.44396209716797, "learning_rate": 1e-06, "loss": 0.5495, "mean_token_accuracy": 0.8736135959625244, "num_tokens": 684763642.0, "step": 17944 }, { "epoch": 2.282788449306704, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.30487060546875, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8677711486816406, "num_tokens": 684806164.0, "step": 17945 }, { "epoch": 2.2829156595852944, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.86969757080078, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8622356057167053, "num_tokens": 684844193.0, "step": 17946 }, { "epoch": 2.283042869863885, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.37983703613281, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8681873083114624, "num_tokens": 684882103.0, "step": 17947 }, { "epoch": 2.2831700801424755, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.58032989501953, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8763951063156128, "num_tokens": 684917667.0, "step": 17948 }, { "epoch": 2.283297290421066, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.875301361083984, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8662122488021851, "num_tokens": 684954007.0, "step": 17949 }, { "epoch": 2.2834245006996565, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.35883712768555, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8564651608467102, "num_tokens": 684994693.0, "step": 17950 }, { "epoch": 2.283551710978247, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.819210052490234, "learning_rate": 1e-06, "loss": 0.6386, "mean_token_accuracy": 0.8494978547096252, "num_tokens": 685032713.0, "step": 17951 }, { "epoch": 2.2836789212568376, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.5444221496582, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8562763929367065, "num_tokens": 685066597.0, "step": 17952 }, { "epoch": 2.283806131535428, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.572200775146484, "learning_rate": 1e-06, "loss": 0.6078, "mean_token_accuracy": 0.8565880060195923, "num_tokens": 685106243.0, "step": 17953 }, { "epoch": 2.2839333418140186, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.601070404052734, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8541878461837769, "num_tokens": 685145177.0, "step": 17954 }, { "epoch": 2.284060552092609, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.601802825927734, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8685213923454285, "num_tokens": 685189130.0, "step": 17955 }, { "epoch": 2.2841877623711997, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.77736282348633, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8659181594848633, "num_tokens": 685227536.0, "step": 17956 }, { "epoch": 2.28431497264979, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.33972930908203, "learning_rate": 1e-06, "loss": 0.5797, "mean_token_accuracy": 0.8675776720046997, "num_tokens": 685267213.0, "step": 17957 }, { "epoch": 2.2844421829283807, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.88194274902344, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8607476949691772, "num_tokens": 685301115.0, "step": 17958 }, { "epoch": 2.2845693932069713, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.24001693725586, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8685830235481262, "num_tokens": 685338497.0, "step": 17959 }, { "epoch": 2.284696603485562, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.133819580078125, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8509908318519592, "num_tokens": 685374983.0, "step": 17960 }, { "epoch": 2.2848238137641523, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.039825439453125, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8690156936645508, "num_tokens": 685408859.0, "step": 17961 }, { "epoch": 2.2849510240427424, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.60623550415039, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8593482971191406, "num_tokens": 685445103.0, "step": 17962 }, { "epoch": 2.2850782343213334, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.4969482421875, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8579429388046265, "num_tokens": 685484657.0, "step": 17963 }, { "epoch": 2.2852054445999235, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.47715759277344, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.870253324508667, "num_tokens": 685524034.0, "step": 17964 }, { "epoch": 2.285332654878514, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.33585739135742, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8678843975067139, "num_tokens": 685560368.0, "step": 17965 }, { "epoch": 2.2854598651571045, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.82411575317383, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8583410978317261, "num_tokens": 685598192.0, "step": 17966 }, { "epoch": 2.285587075435695, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.048187255859375, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8659855127334595, "num_tokens": 685636300.0, "step": 17967 }, { "epoch": 2.2857142857142856, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.87607955932617, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8600390553474426, "num_tokens": 685671457.0, "step": 17968 }, { "epoch": 2.285841495992876, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.291934967041016, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8508896827697754, "num_tokens": 685709687.0, "step": 17969 }, { "epoch": 2.2859687062714666, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.073150634765625, "learning_rate": 1e-06, "loss": 0.6645, "mean_token_accuracy": 0.8413315415382385, "num_tokens": 685747854.0, "step": 17970 }, { "epoch": 2.286095916550057, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.01682662963867, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8817861676216125, "num_tokens": 685785215.0, "step": 17971 }, { "epoch": 2.2862231268286477, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.85624694824219, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8662041425704956, "num_tokens": 685818871.0, "step": 17972 }, { "epoch": 2.2863503371072382, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.33465576171875, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.854638934135437, "num_tokens": 685859089.0, "step": 17973 }, { "epoch": 2.2864775473858288, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.60173797607422, "learning_rate": 1e-06, "loss": 0.6245, "mean_token_accuracy": 0.855204164981842, "num_tokens": 685900346.0, "step": 17974 }, { "epoch": 2.2866047576644193, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.34257507324219, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8612954616546631, "num_tokens": 685938186.0, "step": 17975 }, { "epoch": 2.28673196794301, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.495849609375, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8691779375076294, "num_tokens": 685974911.0, "step": 17976 }, { "epoch": 2.2868591782216003, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.599159240722656, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8646626472473145, "num_tokens": 686010430.0, "step": 17977 }, { "epoch": 2.286986388500191, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.69284439086914, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8736390471458435, "num_tokens": 686042727.0, "step": 17978 }, { "epoch": 2.2871135987787814, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.37787628173828, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8638101816177368, "num_tokens": 686078819.0, "step": 17979 }, { "epoch": 2.287240809057372, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.96828079223633, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8639860153198242, "num_tokens": 686117760.0, "step": 17980 }, { "epoch": 2.2873680193359625, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.82590103149414, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8548057079315186, "num_tokens": 686152992.0, "step": 17981 }, { "epoch": 2.287495229614553, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.249114990234375, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8573206663131714, "num_tokens": 686195353.0, "step": 17982 }, { "epoch": 2.2876224398931435, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.72727584838867, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8582921028137207, "num_tokens": 686232990.0, "step": 17983 }, { "epoch": 2.287749650171734, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.97349166870117, "learning_rate": 1e-06, "loss": 0.5761, "mean_token_accuracy": 0.8674157857894897, "num_tokens": 686265933.0, "step": 17984 }, { "epoch": 2.2878768604503246, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.798038482666016, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8681231737136841, "num_tokens": 686307111.0, "step": 17985 }, { "epoch": 2.288004070728915, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.005855560302734, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8639199733734131, "num_tokens": 686343363.0, "step": 17986 }, { "epoch": 2.288131281007505, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.41695022583008, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8603856563568115, "num_tokens": 686380888.0, "step": 17987 }, { "epoch": 2.288258491286096, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.594905853271484, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8543000817298889, "num_tokens": 686417350.0, "step": 17988 }, { "epoch": 2.2883857015646862, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.35346221923828, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8710666298866272, "num_tokens": 686458658.0, "step": 17989 }, { "epoch": 2.2885129118432768, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.883766174316406, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8656023740768433, "num_tokens": 686493945.0, "step": 17990 }, { "epoch": 2.2886401221218673, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.19904708862305, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8618613481521606, "num_tokens": 686527608.0, "step": 17991 }, { "epoch": 2.288767332400458, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.00322341918945, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8731407523155212, "num_tokens": 686562272.0, "step": 17992 }, { "epoch": 2.2888945426790483, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.958797454833984, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.8742636442184448, "num_tokens": 686599937.0, "step": 17993 }, { "epoch": 2.289021752957639, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.048065185546875, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8645139932632446, "num_tokens": 686632103.0, "step": 17994 }, { "epoch": 2.2891489632362294, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.24689483642578, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8703145384788513, "num_tokens": 686662802.0, "step": 17995 }, { "epoch": 2.28927617351482, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.9455680847168, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8836327195167542, "num_tokens": 686699386.0, "step": 17996 }, { "epoch": 2.2894033837934105, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.049293518066406, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8645314574241638, "num_tokens": 686738369.0, "step": 17997 }, { "epoch": 2.289530594072001, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.32840347290039, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8549254536628723, "num_tokens": 686774543.0, "step": 17998 }, { "epoch": 2.2896578043505915, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.19314193725586, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8525301218032837, "num_tokens": 686807048.0, "step": 17999 }, { "epoch": 2.289785014629182, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.597408294677734, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8625296354293823, "num_tokens": 686844678.0, "step": 18000 }, { "epoch": 2.2899122249077726, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.113319396972656, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8624614477157593, "num_tokens": 686877721.0, "step": 18001 }, { "epoch": 2.290039435186363, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.714744567871094, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8535613417625427, "num_tokens": 686918329.0, "step": 18002 }, { "epoch": 2.2901666454649536, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.04297637939453, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8590201139450073, "num_tokens": 686960050.0, "step": 18003 }, { "epoch": 2.290293855743544, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.24079895019531, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8741820454597473, "num_tokens": 686997849.0, "step": 18004 }, { "epoch": 2.2904210660221347, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.98371124267578, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8673566579818726, "num_tokens": 687032363.0, "step": 18005 }, { "epoch": 2.290548276300725, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.01197814941406, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8463779091835022, "num_tokens": 687069606.0, "step": 18006 }, { "epoch": 2.2906754865793157, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.8316650390625, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8528802394866943, "num_tokens": 687112857.0, "step": 18007 }, { "epoch": 2.2908026968579063, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.379859924316406, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8602948784828186, "num_tokens": 687152245.0, "step": 18008 }, { "epoch": 2.290929907136497, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.96315002441406, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8620507717132568, "num_tokens": 687193988.0, "step": 18009 }, { "epoch": 2.2910571174150873, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.556671142578125, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8574533462524414, "num_tokens": 687232375.0, "step": 18010 }, { "epoch": 2.291184327693678, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.17627716064453, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8669501543045044, "num_tokens": 687274566.0, "step": 18011 }, { "epoch": 2.291311537972268, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.0544319152832, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8617536425590515, "num_tokens": 687313231.0, "step": 18012 }, { "epoch": 2.291438748250859, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.390289306640625, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8672817945480347, "num_tokens": 687354502.0, "step": 18013 }, { "epoch": 2.291565958529449, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.4440803527832, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8595086336135864, "num_tokens": 687399160.0, "step": 18014 }, { "epoch": 2.2916931688080395, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.39027404785156, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8529424667358398, "num_tokens": 687438775.0, "step": 18015 }, { "epoch": 2.29182037908663, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.14574432373047, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8683794140815735, "num_tokens": 687471435.0, "step": 18016 }, { "epoch": 2.2919475893652206, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.58492660522461, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8630462884902954, "num_tokens": 687508657.0, "step": 18017 }, { "epoch": 2.292074799643811, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.30189895629883, "learning_rate": 1e-06, "loss": 0.5589, "mean_token_accuracy": 0.8719311952590942, "num_tokens": 687545024.0, "step": 18018 }, { "epoch": 2.2922020099224016, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.03086853027344, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8654041290283203, "num_tokens": 687579432.0, "step": 18019 }, { "epoch": 2.292329220200992, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.3004264831543, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8519204258918762, "num_tokens": 687618378.0, "step": 18020 }, { "epoch": 2.2924564304795827, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.039710998535156, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8593649864196777, "num_tokens": 687657119.0, "step": 18021 }, { "epoch": 2.2925836407581732, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.545101165771484, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8521720170974731, "num_tokens": 687691924.0, "step": 18022 }, { "epoch": 2.2927108510367638, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.028594970703125, "learning_rate": 1e-06, "loss": 0.6657, "mean_token_accuracy": 0.838137686252594, "num_tokens": 687724221.0, "step": 18023 }, { "epoch": 2.2928380613153543, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.35152053833008, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.864246666431427, "num_tokens": 687765049.0, "step": 18024 }, { "epoch": 2.292965271593945, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.25545120239258, "learning_rate": 1e-06, "loss": 0.6347, "mean_token_accuracy": 0.8513036370277405, "num_tokens": 687810222.0, "step": 18025 }, { "epoch": 2.2930924818725353, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.48134231567383, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8560150861740112, "num_tokens": 687851961.0, "step": 18026 }, { "epoch": 2.293219692151126, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.83943176269531, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8495262861251831, "num_tokens": 687890448.0, "step": 18027 }, { "epoch": 2.2933469024297164, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.491905212402344, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8705754280090332, "num_tokens": 687929509.0, "step": 18028 }, { "epoch": 2.293474112708307, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.02021789550781, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8530726432800293, "num_tokens": 687966814.0, "step": 18029 }, { "epoch": 2.2936013229868975, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.462303161621094, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8700740337371826, "num_tokens": 688006398.0, "step": 18030 }, { "epoch": 2.293728533265488, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.03623962402344, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8585927486419678, "num_tokens": 688044604.0, "step": 18031 }, { "epoch": 2.2938557435440785, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.45106506347656, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8582595586776733, "num_tokens": 688082969.0, "step": 18032 }, { "epoch": 2.293982953822669, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.535770416259766, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8696768283843994, "num_tokens": 688121442.0, "step": 18033 }, { "epoch": 2.2941101641012596, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.35752868652344, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8629268407821655, "num_tokens": 688161096.0, "step": 18034 }, { "epoch": 2.2942373743798496, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.80129623413086, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8644265532493591, "num_tokens": 688200782.0, "step": 18035 }, { "epoch": 2.2943645846584406, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.405174255371094, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8703840970993042, "num_tokens": 688241333.0, "step": 18036 }, { "epoch": 2.2944917949370307, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.993534088134766, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8608558177947998, "num_tokens": 688278297.0, "step": 18037 }, { "epoch": 2.2946190052156212, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.29018783569336, "learning_rate": 1e-06, "loss": 0.6697, "mean_token_accuracy": 0.8388587236404419, "num_tokens": 688318835.0, "step": 18038 }, { "epoch": 2.2947462154942118, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.05351638793945, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8648984432220459, "num_tokens": 688354814.0, "step": 18039 }, { "epoch": 2.2948734257728023, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.98443603515625, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8658586740493774, "num_tokens": 688390008.0, "step": 18040 }, { "epoch": 2.295000636051393, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.15808868408203, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.863196849822998, "num_tokens": 688430938.0, "step": 18041 }, { "epoch": 2.2951278463299833, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.41123580932617, "learning_rate": 1e-06, "loss": 0.7137, "mean_token_accuracy": 0.8308601379394531, "num_tokens": 688466730.0, "step": 18042 }, { "epoch": 2.295255056608574, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.995933532714844, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8620331287384033, "num_tokens": 688506666.0, "step": 18043 }, { "epoch": 2.2953822668871644, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.36043167114258, "learning_rate": 1e-06, "loss": 0.6508, "mean_token_accuracy": 0.8480250239372253, "num_tokens": 688547726.0, "step": 18044 }, { "epoch": 2.295509477165755, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.07008743286133, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8573576807975769, "num_tokens": 688587487.0, "step": 18045 }, { "epoch": 2.2956366874443455, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.794551849365234, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8554270267486572, "num_tokens": 688632454.0, "step": 18046 }, { "epoch": 2.295763897722936, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.70409393310547, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8684037327766418, "num_tokens": 688668489.0, "step": 18047 }, { "epoch": 2.2958911080015265, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.150272369384766, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8654700517654419, "num_tokens": 688707899.0, "step": 18048 }, { "epoch": 2.296018318280117, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.089149475097656, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8676469922065735, "num_tokens": 688747171.0, "step": 18049 }, { "epoch": 2.2961455285587076, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 48.258514404296875, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8603538870811462, "num_tokens": 688789553.0, "step": 18050 }, { "epoch": 2.296272738837298, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3126602172851562e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.68461990356445, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.864388644695282, "num_tokens": 688827632.0, "step": 18051 }, { "epoch": 2.2963999491158886, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.44120407104492, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8714404702186584, "num_tokens": 688861505.0, "step": 18052 }, { "epoch": 2.296527159394479, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 49.08154296875, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8557522892951965, "num_tokens": 688898270.0, "step": 18053 }, { "epoch": 2.2966543696730697, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.56215286254883, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8615991473197937, "num_tokens": 688937420.0, "step": 18054 }, { "epoch": 2.29678157995166, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.62478256225586, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8693625330924988, "num_tokens": 688972426.0, "step": 18055 }, { "epoch": 2.2969087902302507, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.68190002441406, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8599091172218323, "num_tokens": 689005801.0, "step": 18056 }, { "epoch": 2.2970360005088413, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.332489013671875, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8566684722900391, "num_tokens": 689039999.0, "step": 18057 }, { "epoch": 2.297163210787432, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.5556526184082, "learning_rate": 1e-06, "loss": 0.653, "mean_token_accuracy": 0.846017599105835, "num_tokens": 689079404.0, "step": 18058 }, { "epoch": 2.2972904210660223, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.611576080322266, "learning_rate": 1e-06, "loss": 0.6726, "mean_token_accuracy": 0.8375186324119568, "num_tokens": 689116197.0, "step": 18059 }, { "epoch": 2.2974176313446124, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.80327606201172, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8623057007789612, "num_tokens": 689153446.0, "step": 18060 }, { "epoch": 2.2975448416232034, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.30964660644531, "learning_rate": 1e-06, "loss": 0.6457, "mean_token_accuracy": 0.8454214334487915, "num_tokens": 689185310.0, "step": 18061 }, { "epoch": 2.2976720519017935, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.82514953613281, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8721828460693359, "num_tokens": 689224150.0, "step": 18062 }, { "epoch": 2.297799262180384, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.72737503051758, "learning_rate": 1e-06, "loss": 0.6337, "mean_token_accuracy": 0.8531913161277771, "num_tokens": 689261206.0, "step": 18063 }, { "epoch": 2.2979264724589745, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.135215759277344, "learning_rate": 1e-06, "loss": 0.5782, "mean_token_accuracy": 0.867634117603302, "num_tokens": 689299129.0, "step": 18064 }, { "epoch": 2.298053682737565, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.99935531616211, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8785334229469299, "num_tokens": 689330554.0, "step": 18065 }, { "epoch": 2.2981808930161556, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.707359313964844, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8684905171394348, "num_tokens": 689364084.0, "step": 18066 }, { "epoch": 2.298308103294746, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.23716735839844, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8493495583534241, "num_tokens": 689405254.0, "step": 18067 }, { "epoch": 2.2984353135733366, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.448387145996094, "learning_rate": 1e-06, "loss": 0.6329, "mean_token_accuracy": 0.8529911041259766, "num_tokens": 689441109.0, "step": 18068 }, { "epoch": 2.298562523851927, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.57395935058594, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8604491353034973, "num_tokens": 689474937.0, "step": 18069 }, { "epoch": 2.2986897341305177, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.278263092041016, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8573973774909973, "num_tokens": 689515971.0, "step": 18070 }, { "epoch": 2.298816944409108, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.4511604309082, "learning_rate": 1e-06, "loss": 0.6676, "mean_token_accuracy": 0.8382309675216675, "num_tokens": 689553620.0, "step": 18071 }, { "epoch": 2.2989441546876987, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.93138885498047, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8650254011154175, "num_tokens": 689592157.0, "step": 18072 }, { "epoch": 2.2990713649662893, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.166988372802734, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8751336932182312, "num_tokens": 689626889.0, "step": 18073 }, { "epoch": 2.29919857524488, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.185516357421875, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8725464940071106, "num_tokens": 689666161.0, "step": 18074 }, { "epoch": 2.2993257855234703, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.31998062133789, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8532878160476685, "num_tokens": 689713713.0, "step": 18075 }, { "epoch": 2.299452995802061, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.101661682128906, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8535773754119873, "num_tokens": 689755146.0, "step": 18076 }, { "epoch": 2.2995802060806514, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.61068344116211, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8722561597824097, "num_tokens": 689795925.0, "step": 18077 }, { "epoch": 2.299707416359242, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.662071228027344, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8744567036628723, "num_tokens": 689830673.0, "step": 18078 }, { "epoch": 2.2998346266378324, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.800079345703125, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8599424362182617, "num_tokens": 689863649.0, "step": 18079 }, { "epoch": 2.299961836916423, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.535213470458984, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8604979515075684, "num_tokens": 689901946.0, "step": 18080 }, { "epoch": 2.3000890471950135, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.31280517578125, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8562325239181519, "num_tokens": 689939968.0, "step": 18081 }, { "epoch": 2.300216257473604, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.11734390258789, "learning_rate": 1e-06, "loss": 0.5703, "mean_token_accuracy": 0.8702579736709595, "num_tokens": 689979792.0, "step": 18082 }, { "epoch": 2.3003434677521946, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.877986907958984, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.874052882194519, "num_tokens": 690013974.0, "step": 18083 }, { "epoch": 2.300470678030785, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.12628173828125, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8706271648406982, "num_tokens": 690056540.0, "step": 18084 }, { "epoch": 2.300597888309375, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.03588104248047, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8543476462364197, "num_tokens": 690101032.0, "step": 18085 }, { "epoch": 2.300725098587966, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.502872467041016, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8751121759414673, "num_tokens": 690135417.0, "step": 18086 }, { "epoch": 2.3008523088665562, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.838645935058594, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8637098073959351, "num_tokens": 690178552.0, "step": 18087 }, { "epoch": 2.3009795191451468, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.402645111083984, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8590503931045532, "num_tokens": 690221928.0, "step": 18088 }, { "epoch": 2.3011067294237373, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.04984664916992, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8681604862213135, "num_tokens": 690257040.0, "step": 18089 }, { "epoch": 2.301233939702328, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.43716049194336, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8610994815826416, "num_tokens": 690302720.0, "step": 18090 }, { "epoch": 2.3013611499809183, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.77096176147461, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8554080128669739, "num_tokens": 690337042.0, "step": 18091 }, { "epoch": 2.301488360259509, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.2261962890625, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8693622946739197, "num_tokens": 690380247.0, "step": 18092 }, { "epoch": 2.3016155705380994, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.339134216308594, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8639654517173767, "num_tokens": 690425361.0, "step": 18093 }, { "epoch": 2.30174278081669, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.84004592895508, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8576219081878662, "num_tokens": 690465137.0, "step": 18094 }, { "epoch": 2.3018699910952805, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.49614715576172, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8604614734649658, "num_tokens": 690501437.0, "step": 18095 }, { "epoch": 2.301997201373871, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.013004302978516, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8575843572616577, "num_tokens": 690544134.0, "step": 18096 }, { "epoch": 2.3021244116524615, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.49939727783203, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8709670305252075, "num_tokens": 690581062.0, "step": 18097 }, { "epoch": 2.302251621931052, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.15614318847656, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8677088022232056, "num_tokens": 690615145.0, "step": 18098 }, { "epoch": 2.3023788322096426, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.370384216308594, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8680237531661987, "num_tokens": 690655483.0, "step": 18099 }, { "epoch": 2.302506042488233, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.28965759277344, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8602637648582458, "num_tokens": 690694133.0, "step": 18100 }, { "epoch": 2.3026332527668236, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.98828887939453, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.8400605916976929, "num_tokens": 690729939.0, "step": 18101 }, { "epoch": 2.302760463045414, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.7593879699707, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8680124282836914, "num_tokens": 690770139.0, "step": 18102 }, { "epoch": 2.3028876733240047, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.01117706298828, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8647403120994568, "num_tokens": 690811379.0, "step": 18103 }, { "epoch": 2.303014883602595, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.48102569580078, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8583108186721802, "num_tokens": 690844194.0, "step": 18104 }, { "epoch": 2.3031420938811857, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.867401123046875, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8560944199562073, "num_tokens": 690882129.0, "step": 18105 }, { "epoch": 2.3032693041597763, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.38443374633789, "learning_rate": 1e-06, "loss": 0.5533, "mean_token_accuracy": 0.8749593496322632, "num_tokens": 690915568.0, "step": 18106 }, { "epoch": 2.303396514438367, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.928611755371094, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8589124083518982, "num_tokens": 690953412.0, "step": 18107 }, { "epoch": 2.3035237247169573, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.37689208984375, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.860654890537262, "num_tokens": 690990047.0, "step": 18108 }, { "epoch": 2.303650934995548, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.940155029296875, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8680164813995361, "num_tokens": 691031010.0, "step": 18109 }, { "epoch": 2.303778145274138, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.6302490234375, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8655537962913513, "num_tokens": 691065830.0, "step": 18110 }, { "epoch": 2.303905355552729, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.66522979736328, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8736499547958374, "num_tokens": 691101418.0, "step": 18111 }, { "epoch": 2.304032565831319, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.50470733642578, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8459070324897766, "num_tokens": 691139447.0, "step": 18112 }, { "epoch": 2.3041597761099095, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.81668472290039, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.841977596282959, "num_tokens": 691168809.0, "step": 18113 }, { "epoch": 2.3042869863885, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.09273147583008, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8663188815116882, "num_tokens": 691200420.0, "step": 18114 }, { "epoch": 2.3044141966670906, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.25529861450195, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8575841188430786, "num_tokens": 691238587.0, "step": 18115 }, { "epoch": 2.304541406945681, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.12452697753906, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8597683310508728, "num_tokens": 691277137.0, "step": 18116 }, { "epoch": 2.3046686172242716, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.027774810791016, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8590661287307739, "num_tokens": 691319564.0, "step": 18117 }, { "epoch": 2.304795827502862, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.22744369506836, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8601897358894348, "num_tokens": 691359652.0, "step": 18118 }, { "epoch": 2.3049230377814527, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.75294494628906, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8631685376167297, "num_tokens": 691399705.0, "step": 18119 }, { "epoch": 2.305050248060043, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.96719741821289, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.854268491268158, "num_tokens": 691439447.0, "step": 18120 }, { "epoch": 2.3051774583386337, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.994022369384766, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8500680923461914, "num_tokens": 691477453.0, "step": 18121 }, { "epoch": 2.3053046686172243, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.355621337890625, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8632979393005371, "num_tokens": 691517766.0, "step": 18122 }, { "epoch": 2.305431878895815, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.00640869140625, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8601226210594177, "num_tokens": 691553956.0, "step": 18123 }, { "epoch": 2.3055590891744053, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.45347213745117, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8603304028511047, "num_tokens": 691592522.0, "step": 18124 }, { "epoch": 2.305686299452996, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.55287551879883, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8612897396087646, "num_tokens": 691629202.0, "step": 18125 }, { "epoch": 2.3058135097315864, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.37261199951172, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.864532470703125, "num_tokens": 691665324.0, "step": 18126 }, { "epoch": 2.305940720010177, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.63154602050781, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8721590042114258, "num_tokens": 691704223.0, "step": 18127 }, { "epoch": 2.3060679302887674, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.23544692993164, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8589922189712524, "num_tokens": 691745061.0, "step": 18128 }, { "epoch": 2.306195140567358, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.85953140258789, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8729902505874634, "num_tokens": 691783630.0, "step": 18129 }, { "epoch": 2.3063223508459485, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.31350326538086, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.860991895198822, "num_tokens": 691818283.0, "step": 18130 }, { "epoch": 2.306449561124539, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.74901580810547, "learning_rate": 1e-06, "loss": 0.6254, "mean_token_accuracy": 0.8507983684539795, "num_tokens": 691854585.0, "step": 18131 }, { "epoch": 2.3065767714031296, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.4639778137207, "learning_rate": 1e-06, "loss": 0.6475, "mean_token_accuracy": 0.8441048860549927, "num_tokens": 691891784.0, "step": 18132 }, { "epoch": 2.3067039816817196, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.46323013305664, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.863994836807251, "num_tokens": 691927389.0, "step": 18133 }, { "epoch": 2.3068311919603106, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.70093536376953, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.857912540435791, "num_tokens": 691962928.0, "step": 18134 }, { "epoch": 2.3069584022389007, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.5588264465332, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8601391315460205, "num_tokens": 692007064.0, "step": 18135 }, { "epoch": 2.3070856125174912, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.23857116699219, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.866934597492218, "num_tokens": 692040668.0, "step": 18136 }, { "epoch": 2.3072128227960818, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.66154479980469, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.862170934677124, "num_tokens": 692077710.0, "step": 18137 }, { "epoch": 2.3073400330746723, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.29629898071289, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8695600628852844, "num_tokens": 692112316.0, "step": 18138 }, { "epoch": 2.307467243353263, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.87421417236328, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8665826320648193, "num_tokens": 692153201.0, "step": 18139 }, { "epoch": 2.3075944536318533, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.96885299682617, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8548601269721985, "num_tokens": 692192942.0, "step": 18140 }, { "epoch": 2.307721663910444, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.008033752441406, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8781654834747314, "num_tokens": 692235515.0, "step": 18141 }, { "epoch": 2.3078488741890344, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.121673583984375, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8514768481254578, "num_tokens": 692271338.0, "step": 18142 }, { "epoch": 2.307976084467625, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.5677375793457, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8427655696868896, "num_tokens": 692311469.0, "step": 18143 }, { "epoch": 2.3081032947462155, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.906795501708984, "learning_rate": 1e-06, "loss": 0.6589, "mean_token_accuracy": 0.841764509677887, "num_tokens": 692352052.0, "step": 18144 }, { "epoch": 2.308230505024806, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.52698516845703, "learning_rate": 1e-06, "loss": 0.6461, "mean_token_accuracy": 0.8444241285324097, "num_tokens": 692382978.0, "step": 18145 }, { "epoch": 2.3083577153033965, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.21458053588867, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8616922497749329, "num_tokens": 692415651.0, "step": 18146 }, { "epoch": 2.308484925581987, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.35284423828125, "learning_rate": 1e-06, "loss": 0.6532, "mean_token_accuracy": 0.8433322310447693, "num_tokens": 692455937.0, "step": 18147 }, { "epoch": 2.3086121358605776, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.91572952270508, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8574496507644653, "num_tokens": 692496277.0, "step": 18148 }, { "epoch": 2.308739346139168, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.33198928833008, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8626972436904907, "num_tokens": 692530485.0, "step": 18149 }, { "epoch": 2.3088665564177586, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.882816314697266, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8584204912185669, "num_tokens": 692564794.0, "step": 18150 }, { "epoch": 2.308993766696349, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.92036437988281, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8512253761291504, "num_tokens": 692610359.0, "step": 18151 }, { "epoch": 2.3091209769749397, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.79894256591797, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8694794178009033, "num_tokens": 692641560.0, "step": 18152 }, { "epoch": 2.30924818725353, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.561912536621094, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8583462834358215, "num_tokens": 692680114.0, "step": 18153 }, { "epoch": 2.3093753975321207, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.51211929321289, "learning_rate": 1e-06, "loss": 0.6488, "mean_token_accuracy": 0.8474250435829163, "num_tokens": 692714199.0, "step": 18154 }, { "epoch": 2.3095026078107113, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.82600784301758, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8644673824310303, "num_tokens": 692752331.0, "step": 18155 }, { "epoch": 2.309629818089302, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.996150970458984, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8675135374069214, "num_tokens": 692789167.0, "step": 18156 }, { "epoch": 2.3097570283678923, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.47742462158203, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.861668586730957, "num_tokens": 692829234.0, "step": 18157 }, { "epoch": 2.3098842386464824, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.352840423583984, "learning_rate": 1e-06, "loss": 0.6651, "mean_token_accuracy": 0.8387711644172668, "num_tokens": 692863749.0, "step": 18158 }, { "epoch": 2.3100114489250734, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.28349685668945, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.858338475227356, "num_tokens": 692900972.0, "step": 18159 }, { "epoch": 2.3101386592036635, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.54826736450195, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8552159667015076, "num_tokens": 692938522.0, "step": 18160 }, { "epoch": 2.310265869482254, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.193016052246094, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8696761131286621, "num_tokens": 692973653.0, "step": 18161 }, { "epoch": 2.3103930797608445, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.46369934082031, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.8524256944656372, "num_tokens": 693016325.0, "step": 18162 }, { "epoch": 2.310520290039435, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.27227783203125, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8723165392875671, "num_tokens": 693052022.0, "step": 18163 }, { "epoch": 2.3106475003180256, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.28272247314453, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8646388053894043, "num_tokens": 693093936.0, "step": 18164 }, { "epoch": 2.310774710596616, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.15713882446289, "learning_rate": 1e-06, "loss": 0.5469, "mean_token_accuracy": 0.876106858253479, "num_tokens": 693130160.0, "step": 18165 }, { "epoch": 2.3109019208752066, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.44126892089844, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8634047508239746, "num_tokens": 693172405.0, "step": 18166 }, { "epoch": 2.311029131153797, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.25141525268555, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8642960786819458, "num_tokens": 693206499.0, "step": 18167 }, { "epoch": 2.3111563414323877, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.32999038696289, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8610103726387024, "num_tokens": 693244287.0, "step": 18168 }, { "epoch": 2.311283551710978, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.14542770385742, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8536617755889893, "num_tokens": 693286164.0, "step": 18169 }, { "epoch": 2.3114107619895687, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.42745590209961, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8686977624893188, "num_tokens": 693330961.0, "step": 18170 }, { "epoch": 2.3115379722681593, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.61045455932617, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8657917976379395, "num_tokens": 693375169.0, "step": 18171 }, { "epoch": 2.31166518254675, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.238548278808594, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8558980226516724, "num_tokens": 693414127.0, "step": 18172 }, { "epoch": 2.3117923928253403, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.919281005859375, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8590410947799683, "num_tokens": 693450446.0, "step": 18173 }, { "epoch": 2.311919603103931, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.721370697021484, "learning_rate": 1e-06, "loss": 0.6433, "mean_token_accuracy": 0.8510487079620361, "num_tokens": 693490973.0, "step": 18174 }, { "epoch": 2.3120468133825214, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.73456954956055, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8654347062110901, "num_tokens": 693526815.0, "step": 18175 }, { "epoch": 2.312174023661112, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.706275939941406, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8536649942398071, "num_tokens": 693566949.0, "step": 18176 }, { "epoch": 2.3123012339397024, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.38099670410156, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.853227972984314, "num_tokens": 693601116.0, "step": 18177 }, { "epoch": 2.312428444218293, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.339324951171875, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8923177123069763, "num_tokens": 693640588.0, "step": 18178 }, { "epoch": 2.3125556544968835, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.39559555053711, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8619333505630493, "num_tokens": 693683352.0, "step": 18179 }, { "epoch": 2.312682864775474, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.61091995239258, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8657363057136536, "num_tokens": 693722579.0, "step": 18180 }, { "epoch": 2.3128100750540646, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.43461227416992, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8627459406852722, "num_tokens": 693766743.0, "step": 18181 }, { "epoch": 2.312937285332655, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.510765075683594, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8636640310287476, "num_tokens": 693809699.0, "step": 18182 }, { "epoch": 2.313064495611245, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.084693908691406, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8664904236793518, "num_tokens": 693845508.0, "step": 18183 }, { "epoch": 2.313191705889836, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.46102523803711, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8597753047943115, "num_tokens": 693883763.0, "step": 18184 }, { "epoch": 2.313318916168426, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.934410095214844, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8654052019119263, "num_tokens": 693917093.0, "step": 18185 }, { "epoch": 2.3134461264470167, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.45661163330078, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8619120121002197, "num_tokens": 693949219.0, "step": 18186 }, { "epoch": 2.3135733367256073, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 48.48073959350586, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.855322003364563, "num_tokens": 693992755.0, "step": 18187 }, { "epoch": 2.313700547004198, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.216854095458984, "learning_rate": 1e-06, "loss": 0.6584, "mean_token_accuracy": 0.8473861217498779, "num_tokens": 694030386.0, "step": 18188 }, { "epoch": 2.3138277572827883, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.92851257324219, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.87012779712677, "num_tokens": 694065019.0, "step": 18189 }, { "epoch": 2.313954967561379, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.49342346191406, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8584422469139099, "num_tokens": 694106645.0, "step": 18190 }, { "epoch": 2.3140821778399694, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.05577850341797, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8666465282440186, "num_tokens": 694149146.0, "step": 18191 }, { "epoch": 2.31420938811856, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.42641067504883, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8458997011184692, "num_tokens": 694193567.0, "step": 18192 }, { "epoch": 2.3143365983971504, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.10770034790039, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8734229207038879, "num_tokens": 694232160.0, "step": 18193 }, { "epoch": 2.314463808675741, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.29060363769531, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8623440861701965, "num_tokens": 694275153.0, "step": 18194 }, { "epoch": 2.3145910189543315, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.2392692565918, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8676906824111938, "num_tokens": 694313722.0, "step": 18195 }, { "epoch": 2.314718229232922, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.63371658325195, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8655620217323303, "num_tokens": 694350544.0, "step": 18196 }, { "epoch": 2.3148454395115126, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.80113983154297, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8629895448684692, "num_tokens": 694386552.0, "step": 18197 }, { "epoch": 2.314972649790103, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.9248161315918, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8577557802200317, "num_tokens": 694434368.0, "step": 18198 }, { "epoch": 2.3150998600686936, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.752899169921875, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8612725734710693, "num_tokens": 694479856.0, "step": 18199 }, { "epoch": 2.315227070347284, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.87111282348633, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.862185001373291, "num_tokens": 694521107.0, "step": 18200 }, { "epoch": 2.3153542806258747, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.18315887451172, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8613138794898987, "num_tokens": 694560886.0, "step": 18201 }, { "epoch": 2.315481490904465, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.76582717895508, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8669790625572205, "num_tokens": 694598404.0, "step": 18202 }, { "epoch": 2.3156087011830557, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.84377670288086, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8561615943908691, "num_tokens": 694637162.0, "step": 18203 }, { "epoch": 2.3157359114616463, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.18754959106445, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8650064468383789, "num_tokens": 694677435.0, "step": 18204 }, { "epoch": 2.315863121740237, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.55543518066406, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8715125322341919, "num_tokens": 694713980.0, "step": 18205 }, { "epoch": 2.3159903320188273, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.186580657958984, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8571658730506897, "num_tokens": 694753884.0, "step": 18206 }, { "epoch": 2.316117542297418, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.532249450683594, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8750383257865906, "num_tokens": 694785739.0, "step": 18207 }, { "epoch": 2.316244752576008, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.09981918334961, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8603593111038208, "num_tokens": 694827629.0, "step": 18208 }, { "epoch": 2.316371962854599, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.54063415527344, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8578562140464783, "num_tokens": 694868241.0, "step": 18209 }, { "epoch": 2.316499173133189, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.084007263183594, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8514214754104614, "num_tokens": 694905298.0, "step": 18210 }, { "epoch": 2.3166263834117795, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.40331268310547, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8531354665756226, "num_tokens": 694938054.0, "step": 18211 }, { "epoch": 2.31675359369037, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.44435501098633, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8557873368263245, "num_tokens": 694978761.0, "step": 18212 }, { "epoch": 2.3168808039689606, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 47.72025680541992, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8570137619972229, "num_tokens": 695015073.0, "step": 18213 }, { "epoch": 2.317008014247551, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.601383209228516, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8556115031242371, "num_tokens": 695052288.0, "step": 18214 }, { "epoch": 2.3171352245261416, "ewc_loss": 0.1845703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016117095947265625, "grad_norm": 47.776397705078125, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8502322435379028, "num_tokens": 695088976.0, "step": 18215 }, { "epoch": 2.317262434804732, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.49724197387695, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8657882213592529, "num_tokens": 695128482.0, "step": 18216 }, { "epoch": 2.3173896450833227, "ewc_loss": 0.185546875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001621246337890625, "grad_norm": 47.910888671875, "learning_rate": 1e-06, "loss": 0.6037, "mean_token_accuracy": 0.8582308292388916, "num_tokens": 695164963.0, "step": 18217 }, { "epoch": 2.317516855361913, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.56711196899414, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8826360702514648, "num_tokens": 695201515.0, "step": 18218 }, { "epoch": 2.3176440656405037, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.00087356567383, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8653191328048706, "num_tokens": 695242057.0, "step": 18219 }, { "epoch": 2.3177712759190943, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.2075080871582, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.863070011138916, "num_tokens": 695277128.0, "step": 18220 }, { "epoch": 2.317898486197685, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.102088928222656, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8754115104675293, "num_tokens": 695311984.0, "step": 18221 }, { "epoch": 2.3180256964762753, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.07075881958008, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8494604825973511, "num_tokens": 695348457.0, "step": 18222 }, { "epoch": 2.318152906754866, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.45339584350586, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8633471131324768, "num_tokens": 695386518.0, "step": 18223 }, { "epoch": 2.3182801170334564, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.43966293334961, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8582009673118591, "num_tokens": 695425365.0, "step": 18224 }, { "epoch": 2.318407327312047, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.04116439819336, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8631642460823059, "num_tokens": 695459559.0, "step": 18225 }, { "epoch": 2.3185345375906374, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.23609161376953, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8549126386642456, "num_tokens": 695491386.0, "step": 18226 }, { "epoch": 2.318661747869228, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.91572952270508, "learning_rate": 1e-06, "loss": 0.6583, "mean_token_accuracy": 0.8448973894119263, "num_tokens": 695534660.0, "step": 18227 }, { "epoch": 2.3187889581478185, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.29340362548828, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8540506362915039, "num_tokens": 695578127.0, "step": 18228 }, { "epoch": 2.318916168426409, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.23517608642578, "learning_rate": 1e-06, "loss": 0.6435, "mean_token_accuracy": 0.8485798835754395, "num_tokens": 695614181.0, "step": 18229 }, { "epoch": 2.3190433787049995, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.75101089477539, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.859647810459137, "num_tokens": 695652753.0, "step": 18230 }, { "epoch": 2.3191705889835896, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.1002311706543, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8686641454696655, "num_tokens": 695692449.0, "step": 18231 }, { "epoch": 2.3192977992621806, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.57087326049805, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8626441955566406, "num_tokens": 695730812.0, "step": 18232 }, { "epoch": 2.3194250095407707, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.1071891784668, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8755502104759216, "num_tokens": 695767577.0, "step": 18233 }, { "epoch": 2.319552219819361, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 49.02416229248047, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8609127998352051, "num_tokens": 695812372.0, "step": 18234 }, { "epoch": 2.3196794300979517, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.733184814453125, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.8781232833862305, "num_tokens": 695847470.0, "step": 18235 }, { "epoch": 2.3198066403765423, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.410980224609375, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8787421584129333, "num_tokens": 695888082.0, "step": 18236 }, { "epoch": 2.319933850655133, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.3121337890625, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8686371445655823, "num_tokens": 695928904.0, "step": 18237 }, { "epoch": 2.3200610609337233, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.16702651977539, "learning_rate": 1e-06, "loss": 0.6361, "mean_token_accuracy": 0.8492556214332581, "num_tokens": 695967549.0, "step": 18238 }, { "epoch": 2.320188271212314, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.37318801879883, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8565346002578735, "num_tokens": 696005611.0, "step": 18239 }, { "epoch": 2.3203154814909044, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.133392333984375, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8655582666397095, "num_tokens": 696034864.0, "step": 18240 }, { "epoch": 2.320442691769495, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.41094207763672, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8583446145057678, "num_tokens": 696072137.0, "step": 18241 }, { "epoch": 2.3205699020480854, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.24881362915039, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8651537895202637, "num_tokens": 696115744.0, "step": 18242 }, { "epoch": 2.320697112326676, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.128517150878906, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8573694825172424, "num_tokens": 696151293.0, "step": 18243 }, { "epoch": 2.3208243226052665, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.58208465576172, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.880562424659729, "num_tokens": 696190742.0, "step": 18244 }, { "epoch": 2.320951532883857, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.31155014038086, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8506186604499817, "num_tokens": 696232807.0, "step": 18245 }, { "epoch": 2.3210787431624476, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.510231018066406, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.858485758304596, "num_tokens": 696269416.0, "step": 18246 }, { "epoch": 2.321205953441038, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.566375732421875, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8638984560966492, "num_tokens": 696305144.0, "step": 18247 }, { "epoch": 2.3213331637196286, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.95718765258789, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8530042767524719, "num_tokens": 696344854.0, "step": 18248 }, { "epoch": 2.321460373998219, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.489051818847656, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8796194195747375, "num_tokens": 696380154.0, "step": 18249 }, { "epoch": 2.3215875842768097, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.2744255065918, "learning_rate": 1e-06, "loss": 0.6442, "mean_token_accuracy": 0.8458296060562134, "num_tokens": 696416458.0, "step": 18250 }, { "epoch": 2.3217147945554, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.9124870300293, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8613483905792236, "num_tokens": 696453232.0, "step": 18251 }, { "epoch": 2.3218420048339907, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.6589469909668, "learning_rate": 1e-06, "loss": 0.6555, "mean_token_accuracy": 0.8400137424468994, "num_tokens": 696487930.0, "step": 18252 }, { "epoch": 2.3219692151125813, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.97523498535156, "learning_rate": 1e-06, "loss": 0.6383, "mean_token_accuracy": 0.8516697883605957, "num_tokens": 696523824.0, "step": 18253 }, { "epoch": 2.322096425391172, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.74043273925781, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.872309148311615, "num_tokens": 696567007.0, "step": 18254 }, { "epoch": 2.3222236356697623, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.375003814697266, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8787920475006104, "num_tokens": 696610055.0, "step": 18255 }, { "epoch": 2.3223508459483524, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.98713684082031, "learning_rate": 1e-06, "loss": 0.6325, "mean_token_accuracy": 0.8458954095840454, "num_tokens": 696644058.0, "step": 18256 }, { "epoch": 2.3224780562269434, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.22578430175781, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.869442880153656, "num_tokens": 696686381.0, "step": 18257 }, { "epoch": 2.3226052665055335, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.734283447265625, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.857150673866272, "num_tokens": 696727567.0, "step": 18258 }, { "epoch": 2.322732476784124, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.18007278442383, "learning_rate": 1e-06, "loss": 0.6328, "mean_token_accuracy": 0.8520721197128296, "num_tokens": 696767266.0, "step": 18259 }, { "epoch": 2.3228596870627145, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.975425720214844, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8746165633201599, "num_tokens": 696806059.0, "step": 18260 }, { "epoch": 2.322986897341305, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.18778991699219, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8582308888435364, "num_tokens": 696844232.0, "step": 18261 }, { "epoch": 2.3231141076198956, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.631103515625, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8641602396965027, "num_tokens": 696886223.0, "step": 18262 }, { "epoch": 2.323241317898486, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.52804183959961, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8474299907684326, "num_tokens": 696922625.0, "step": 18263 }, { "epoch": 2.3233685281770766, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.19063186645508, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8499442338943481, "num_tokens": 696965160.0, "step": 18264 }, { "epoch": 2.323495738455667, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.909202575683594, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8680292367935181, "num_tokens": 697000854.0, "step": 18265 }, { "epoch": 2.3236229487342577, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.47289276123047, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.877054750919342, "num_tokens": 697041264.0, "step": 18266 }, { "epoch": 2.323750159012848, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.01014709472656, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8590014576911926, "num_tokens": 697080079.0, "step": 18267 }, { "epoch": 2.3238773692914387, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.308685302734375, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8580158948898315, "num_tokens": 697123755.0, "step": 18268 }, { "epoch": 2.3240045795700293, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.56059646606445, "learning_rate": 1e-06, "loss": 0.6995, "mean_token_accuracy": 0.8320128917694092, "num_tokens": 697162012.0, "step": 18269 }, { "epoch": 2.32413178984862, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.779144287109375, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8709529042243958, "num_tokens": 697200188.0, "step": 18270 }, { "epoch": 2.3242590001272103, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.0223274230957, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8643526434898376, "num_tokens": 697240536.0, "step": 18271 }, { "epoch": 2.324386210405801, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.050045013427734, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8722963333129883, "num_tokens": 697279459.0, "step": 18272 }, { "epoch": 2.3245134206843914, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.0789794921875, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8797937631607056, "num_tokens": 697312594.0, "step": 18273 }, { "epoch": 2.324640630962982, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.21128845214844, "learning_rate": 1e-06, "loss": 0.649, "mean_token_accuracy": 0.8467568159103394, "num_tokens": 697357834.0, "step": 18274 }, { "epoch": 2.3247678412415724, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.44865798950195, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8599490523338318, "num_tokens": 697401522.0, "step": 18275 }, { "epoch": 2.324895051520163, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.605247497558594, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8584003448486328, "num_tokens": 697440415.0, "step": 18276 }, { "epoch": 2.3250222617987535, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.43446731567383, "learning_rate": 1e-06, "loss": 0.649, "mean_token_accuracy": 0.8441663980484009, "num_tokens": 697477609.0, "step": 18277 }, { "epoch": 2.325149472077344, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.676544189453125, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8566602468490601, "num_tokens": 697515566.0, "step": 18278 }, { "epoch": 2.3252766823559345, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.456199645996094, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8650997877120972, "num_tokens": 697549537.0, "step": 18279 }, { "epoch": 2.325403892634525, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.256744384765625, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8651583194732666, "num_tokens": 697582618.0, "step": 18280 }, { "epoch": 2.325531102913115, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.38542556762695, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8752585053443909, "num_tokens": 697622234.0, "step": 18281 }, { "epoch": 2.325658313191706, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.057861328125, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8532359600067139, "num_tokens": 697657504.0, "step": 18282 }, { "epoch": 2.325785523470296, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.29588317871094, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8481590151786804, "num_tokens": 697699496.0, "step": 18283 }, { "epoch": 2.3259127337488867, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.951316833496094, "learning_rate": 1e-06, "loss": 0.5551, "mean_token_accuracy": 0.8754433393478394, "num_tokens": 697739969.0, "step": 18284 }, { "epoch": 2.3260399440274773, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.46376037597656, "learning_rate": 1e-06, "loss": 0.6389, "mean_token_accuracy": 0.8493942618370056, "num_tokens": 697774346.0, "step": 18285 }, { "epoch": 2.326167154306068, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.156455993652344, "learning_rate": 1e-06, "loss": 0.6493, "mean_token_accuracy": 0.8452630043029785, "num_tokens": 697813013.0, "step": 18286 }, { "epoch": 2.3262943645846583, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.15464401245117, "learning_rate": 1e-06, "loss": 0.6358, "mean_token_accuracy": 0.8551920056343079, "num_tokens": 697854163.0, "step": 18287 }, { "epoch": 2.326421574863249, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.49232864379883, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8703035712242126, "num_tokens": 697898049.0, "step": 18288 }, { "epoch": 2.3265487851418394, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.01774597167969, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8644612431526184, "num_tokens": 697936086.0, "step": 18289 }, { "epoch": 2.32667599542043, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.685813903808594, "learning_rate": 1e-06, "loss": 0.7024, "mean_token_accuracy": 0.8367695212364197, "num_tokens": 697981529.0, "step": 18290 }, { "epoch": 2.3268032056990204, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.300113677978516, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8775471448898315, "num_tokens": 698016047.0, "step": 18291 }, { "epoch": 2.326930415977611, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.65167236328125, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8735291957855225, "num_tokens": 698052408.0, "step": 18292 }, { "epoch": 2.3270576262562015, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.05472183227539, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8539350628852844, "num_tokens": 698089751.0, "step": 18293 }, { "epoch": 2.327184836534792, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.78355026245117, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8589305281639099, "num_tokens": 698121274.0, "step": 18294 }, { "epoch": 2.3273120468133826, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.81838607788086, "learning_rate": 1e-06, "loss": 0.565, "mean_token_accuracy": 0.871637225151062, "num_tokens": 698155880.0, "step": 18295 }, { "epoch": 2.327439257091973, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.72169876098633, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8650338649749756, "num_tokens": 698191448.0, "step": 18296 }, { "epoch": 2.3275664673705636, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.194580078125, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8554314374923706, "num_tokens": 698228437.0, "step": 18297 }, { "epoch": 2.327693677649154, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.70241165161133, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8530990481376648, "num_tokens": 698268714.0, "step": 18298 }, { "epoch": 2.3278208879277447, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.391170501708984, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8615133762359619, "num_tokens": 698305575.0, "step": 18299 }, { "epoch": 2.327948098206335, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.56977462768555, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8586838245391846, "num_tokens": 698347701.0, "step": 18300 }, { "epoch": 2.3280753084849257, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.296993255615234, "learning_rate": 1e-06, "loss": 0.5617, "mean_token_accuracy": 0.8762410879135132, "num_tokens": 698385906.0, "step": 18301 }, { "epoch": 2.3282025187635162, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.949886322021484, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.860750138759613, "num_tokens": 698415511.0, "step": 18302 }, { "epoch": 2.3283297290421068, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.79168701171875, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.857245922088623, "num_tokens": 698460528.0, "step": 18303 }, { "epoch": 2.3284569393206973, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.12916564941406, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8464844822883606, "num_tokens": 698500800.0, "step": 18304 }, { "epoch": 2.328584149599288, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.44308090209961, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8606491088867188, "num_tokens": 698539373.0, "step": 18305 }, { "epoch": 2.328711359877878, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.361167907714844, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8827353715896606, "num_tokens": 698577067.0, "step": 18306 }, { "epoch": 2.328838570156469, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.73360061645508, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8602054119110107, "num_tokens": 698613515.0, "step": 18307 }, { "epoch": 2.328965780435059, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.34025955200195, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8624579310417175, "num_tokens": 698647278.0, "step": 18308 }, { "epoch": 2.3290929907136495, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.005985260009766, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8689854145050049, "num_tokens": 698688738.0, "step": 18309 }, { "epoch": 2.32922020099224, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.310916900634766, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8657523393630981, "num_tokens": 698725003.0, "step": 18310 }, { "epoch": 2.3293474112708306, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.81895065307617, "learning_rate": 1e-06, "loss": 0.6505, "mean_token_accuracy": 0.843612551689148, "num_tokens": 698768205.0, "step": 18311 }, { "epoch": 2.329474621549421, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.113433837890625, "learning_rate": 1e-06, "loss": 0.6639, "mean_token_accuracy": 0.8409197330474854, "num_tokens": 698805352.0, "step": 18312 }, { "epoch": 2.3296018318280116, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.97206497192383, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8717191815376282, "num_tokens": 698836610.0, "step": 18313 }, { "epoch": 2.329729042106602, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.13071060180664, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8701670169830322, "num_tokens": 698877514.0, "step": 18314 }, { "epoch": 2.3298562523851927, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.98574447631836, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8645352125167847, "num_tokens": 698916699.0, "step": 18315 }, { "epoch": 2.329983462663783, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.130855560302734, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.86252361536026, "num_tokens": 698953568.0, "step": 18316 }, { "epoch": 2.3301106729423737, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.744651794433594, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8569212555885315, "num_tokens": 698990562.0, "step": 18317 }, { "epoch": 2.3302378832209643, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.88859939575195, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8640791177749634, "num_tokens": 699024095.0, "step": 18318 }, { "epoch": 2.330365093499555, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.123050689697266, "learning_rate": 1e-06, "loss": 0.6629, "mean_token_accuracy": 0.8403425216674805, "num_tokens": 699062327.0, "step": 18319 }, { "epoch": 2.3304923037781453, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.89250946044922, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.866346001625061, "num_tokens": 699101591.0, "step": 18320 }, { "epoch": 2.330619514056736, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.979087829589844, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8518933653831482, "num_tokens": 699134836.0, "step": 18321 }, { "epoch": 2.3307467243353264, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.772613525390625, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8518266677856445, "num_tokens": 699171218.0, "step": 18322 }, { "epoch": 2.330873934613917, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.72029113769531, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8780956268310547, "num_tokens": 699200891.0, "step": 18323 }, { "epoch": 2.3310011448925074, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.93817138671875, "learning_rate": 1e-06, "loss": 0.5497, "mean_token_accuracy": 0.8775678277015686, "num_tokens": 699238080.0, "step": 18324 }, { "epoch": 2.331128355171098, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.5416374206543, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8492115139961243, "num_tokens": 699276051.0, "step": 18325 }, { "epoch": 2.3312555654496885, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.095455169677734, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.8699581623077393, "num_tokens": 699315721.0, "step": 18326 }, { "epoch": 2.331382775728279, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.51007080078125, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.872706949710846, "num_tokens": 699350342.0, "step": 18327 }, { "epoch": 2.3315099860068695, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.106910705566406, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8671282529830933, "num_tokens": 699390301.0, "step": 18328 }, { "epoch": 2.3316371962854596, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 47.97780227661133, "learning_rate": 1e-06, "loss": 0.641, "mean_token_accuracy": 0.8461668491363525, "num_tokens": 699427135.0, "step": 18329 }, { "epoch": 2.3317644065640506, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.344200134277344, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8675830960273743, "num_tokens": 699466461.0, "step": 18330 }, { "epoch": 2.3318916168426407, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.14289855957031, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8559209704399109, "num_tokens": 699502446.0, "step": 18331 }, { "epoch": 2.332018827121231, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.30777359008789, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8612241148948669, "num_tokens": 699546357.0, "step": 18332 }, { "epoch": 2.3321460373998217, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.19779968261719, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8613681793212891, "num_tokens": 699581879.0, "step": 18333 }, { "epoch": 2.3322732476784123, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 47.966697692871094, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8754901885986328, "num_tokens": 699616287.0, "step": 18334 }, { "epoch": 2.332400457957003, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.351070404052734, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.859778881072998, "num_tokens": 699655409.0, "step": 18335 }, { "epoch": 2.3325276682355933, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 47.91847229003906, "learning_rate": 1e-06, "loss": 0.6741, "mean_token_accuracy": 0.8360214233398438, "num_tokens": 699701420.0, "step": 18336 }, { "epoch": 2.332654878514184, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.61028289794922, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8637881278991699, "num_tokens": 699736151.0, "step": 18337 }, { "epoch": 2.3327820887927744, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 47.63439178466797, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8533448576927185, "num_tokens": 699770111.0, "step": 18338 }, { "epoch": 2.332909299071365, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.7432861328125, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8589173555374146, "num_tokens": 699807442.0, "step": 18339 }, { "epoch": 2.3330365093499554, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.60022735595703, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8596996665000916, "num_tokens": 699849364.0, "step": 18340 }, { "epoch": 2.333163719628546, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.53806686401367, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8748771548271179, "num_tokens": 699886234.0, "step": 18341 }, { "epoch": 2.3332909299071365, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.034873962402344, "learning_rate": 1e-06, "loss": 0.6449, "mean_token_accuracy": 0.8437198400497437, "num_tokens": 699925660.0, "step": 18342 }, { "epoch": 2.333418140185727, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.591705322265625, "learning_rate": 1e-06, "loss": 0.6921, "mean_token_accuracy": 0.8358079195022583, "num_tokens": 699967894.0, "step": 18343 }, { "epoch": 2.3335453504643175, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.648658752441406, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.853727400302887, "num_tokens": 699999211.0, "step": 18344 }, { "epoch": 2.333672560742908, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.8761100769043, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8692190051078796, "num_tokens": 700037639.0, "step": 18345 }, { "epoch": 2.3337997710214986, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.73561096191406, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8537938594818115, "num_tokens": 700075677.0, "step": 18346 }, { "epoch": 2.333926981300089, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.505516052246094, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8682992458343506, "num_tokens": 700115697.0, "step": 18347 }, { "epoch": 2.3340541915786797, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 47.7156867980957, "learning_rate": 1e-06, "loss": 0.6514, "mean_token_accuracy": 0.8440432548522949, "num_tokens": 700152553.0, "step": 18348 }, { "epoch": 2.33418140185727, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.988983154296875, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8670703172683716, "num_tokens": 700190958.0, "step": 18349 }, { "epoch": 2.3343086121358607, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.545040130615234, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8579208254814148, "num_tokens": 700224244.0, "step": 18350 }, { "epoch": 2.3344358224144512, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.6182746887207, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8886310458183289, "num_tokens": 700259736.0, "step": 18351 }, { "epoch": 2.3345630326930418, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.36487579345703, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8618711829185486, "num_tokens": 700293624.0, "step": 18352 }, { "epoch": 2.3346902429716323, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.02248001098633, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8649299144744873, "num_tokens": 700328052.0, "step": 18353 }, { "epoch": 2.3348174532502224, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.41616439819336, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8536593317985535, "num_tokens": 700364652.0, "step": 18354 }, { "epoch": 2.3349446635288134, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.57948303222656, "learning_rate": 1e-06, "loss": 0.5599, "mean_token_accuracy": 0.8741970062255859, "num_tokens": 700403698.0, "step": 18355 }, { "epoch": 2.3350718738074034, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.91468811035156, "learning_rate": 1e-06, "loss": 0.5537, "mean_token_accuracy": 0.8755602836608887, "num_tokens": 700439391.0, "step": 18356 }, { "epoch": 2.335199084085994, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.719093322753906, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8634711503982544, "num_tokens": 700475095.0, "step": 18357 }, { "epoch": 2.3353262943645845, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.59470748901367, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8659734725952148, "num_tokens": 700514167.0, "step": 18358 }, { "epoch": 2.335453504643175, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.626678466796875, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.8701863884925842, "num_tokens": 700548333.0, "step": 18359 }, { "epoch": 2.3355807149217656, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.640525817871094, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8653585314750671, "num_tokens": 700594261.0, "step": 18360 }, { "epoch": 2.335707925200356, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.986412048339844, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8686679601669312, "num_tokens": 700632513.0, "step": 18361 }, { "epoch": 2.3358351354789466, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.637664794921875, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8679953813552856, "num_tokens": 700672430.0, "step": 18362 }, { "epoch": 2.335962345757537, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.978309631347656, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.868262767791748, "num_tokens": 700714321.0, "step": 18363 }, { "epoch": 2.3360895560361277, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.640323638916016, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8619924783706665, "num_tokens": 700751714.0, "step": 18364 }, { "epoch": 2.336216766314718, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.19454574584961, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8593235015869141, "num_tokens": 700794087.0, "step": 18365 }, { "epoch": 2.3363439765933087, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.569175720214844, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8566850423812866, "num_tokens": 700832031.0, "step": 18366 }, { "epoch": 2.3364711868718993, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.53457260131836, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8631469011306763, "num_tokens": 700867175.0, "step": 18367 }, { "epoch": 2.33659839715049, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.153934478759766, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8687130212783813, "num_tokens": 700906762.0, "step": 18368 }, { "epoch": 2.3367256074290803, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.13817596435547, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.864330530166626, "num_tokens": 700944316.0, "step": 18369 }, { "epoch": 2.336852817707671, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.443416595458984, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.859484851360321, "num_tokens": 700979369.0, "step": 18370 }, { "epoch": 2.3369800279862614, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.921485900878906, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8592867851257324, "num_tokens": 701016213.0, "step": 18371 }, { "epoch": 2.337107238264852, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.54002380371094, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8588558435440063, "num_tokens": 701055270.0, "step": 18372 }, { "epoch": 2.3372344485434424, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 47.877288818359375, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8593208193778992, "num_tokens": 701091058.0, "step": 18373 }, { "epoch": 2.337361658822033, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.444236755371094, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8694793581962585, "num_tokens": 701129446.0, "step": 18374 }, { "epoch": 2.3374888691006235, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.167236328125, "learning_rate": 1e-06, "loss": 0.5795, "mean_token_accuracy": 0.8662654161453247, "num_tokens": 701168726.0, "step": 18375 }, { "epoch": 2.337616079379214, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.206298828125, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8565105199813843, "num_tokens": 701204041.0, "step": 18376 }, { "epoch": 2.3377432896578045, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.6275520324707, "learning_rate": 1e-06, "loss": 0.6684, "mean_token_accuracy": 0.8398176431655884, "num_tokens": 701244660.0, "step": 18377 }, { "epoch": 2.337870499936395, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.021209716796875, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8527219295501709, "num_tokens": 701284830.0, "step": 18378 }, { "epoch": 2.337997710214985, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.48371887207031, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8686105608940125, "num_tokens": 701319720.0, "step": 18379 }, { "epoch": 2.338124920493576, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.91120147705078, "learning_rate": 1e-06, "loss": 0.6664, "mean_token_accuracy": 0.8409971594810486, "num_tokens": 701361770.0, "step": 18380 }, { "epoch": 2.338252130772166, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.44825744628906, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.8601394891738892, "num_tokens": 701399777.0, "step": 18381 }, { "epoch": 2.3383793410507567, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.218017578125, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8599928021430969, "num_tokens": 701432680.0, "step": 18382 }, { "epoch": 2.3385065513293473, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 47.91219711303711, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8723459243774414, "num_tokens": 701474101.0, "step": 18383 }, { "epoch": 2.338633761607938, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.664947509765625, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8746982216835022, "num_tokens": 701517568.0, "step": 18384 }, { "epoch": 2.3387609718865283, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.7657356262207, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8562130928039551, "num_tokens": 701558466.0, "step": 18385 }, { "epoch": 2.338888182165119, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.95441818237305, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8554496765136719, "num_tokens": 701591104.0, "step": 18386 }, { "epoch": 2.3390153924437094, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.82380294799805, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8542789220809937, "num_tokens": 701633242.0, "step": 18387 }, { "epoch": 2.3391426027223, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.00582504272461, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8601675033569336, "num_tokens": 701668864.0, "step": 18388 }, { "epoch": 2.3392698130008904, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.35158920288086, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8633591532707214, "num_tokens": 701703648.0, "step": 18389 }, { "epoch": 2.339397023279481, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.69612503051758, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8574113249778748, "num_tokens": 701744417.0, "step": 18390 }, { "epoch": 2.3395242335580715, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.65950393676758, "learning_rate": 1e-06, "loss": 0.686, "mean_token_accuracy": 0.8286826014518738, "num_tokens": 701783541.0, "step": 18391 }, { "epoch": 2.339651443836662, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.90978240966797, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8577685952186584, "num_tokens": 701825054.0, "step": 18392 }, { "epoch": 2.3397786541152525, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.571346282958984, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8699129819869995, "num_tokens": 701866173.0, "step": 18393 }, { "epoch": 2.339905864393843, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.60518264770508, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8568490743637085, "num_tokens": 701904812.0, "step": 18394 }, { "epoch": 2.3400330746724336, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.74713897705078, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8506240844726562, "num_tokens": 701942590.0, "step": 18395 }, { "epoch": 2.340160284951024, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.379573822021484, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8642112612724304, "num_tokens": 701980773.0, "step": 18396 }, { "epoch": 2.3402874952296147, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.9848518371582, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8723316788673401, "num_tokens": 702014508.0, "step": 18397 }, { "epoch": 2.340414705508205, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.090728759765625, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8732531666755676, "num_tokens": 702055754.0, "step": 18398 }, { "epoch": 2.3405419157867957, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.66746139526367, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8597939610481262, "num_tokens": 702100022.0, "step": 18399 }, { "epoch": 2.3406691260653862, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.302730560302734, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8491542339324951, "num_tokens": 702133217.0, "step": 18400 }, { "epoch": 2.3407963363439768, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.89232635498047, "learning_rate": 1e-06, "loss": 0.5573, "mean_token_accuracy": 0.8749639987945557, "num_tokens": 702176998.0, "step": 18401 }, { "epoch": 2.3409235466225673, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.59192657470703, "learning_rate": 1e-06, "loss": 0.5681, "mean_token_accuracy": 0.8674205541610718, "num_tokens": 702216219.0, "step": 18402 }, { "epoch": 2.341050756901158, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.30718994140625, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8554282784461975, "num_tokens": 702246502.0, "step": 18403 }, { "epoch": 2.341177967179748, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.43010330200195, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.868687629699707, "num_tokens": 702277990.0, "step": 18404 }, { "epoch": 2.341305177458339, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.87696838378906, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8639276027679443, "num_tokens": 702312742.0, "step": 18405 }, { "epoch": 2.341432387736929, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.90312957763672, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8774901032447815, "num_tokens": 702348575.0, "step": 18406 }, { "epoch": 2.3415595980155195, "ewc_loss": 0.1865234375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016307830810546875, "grad_norm": 47.68274688720703, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8499422073364258, "num_tokens": 702385956.0, "step": 18407 }, { "epoch": 2.34168680829411, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.08938217163086, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8663359880447388, "num_tokens": 702423973.0, "step": 18408 }, { "epoch": 2.3418140185727006, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.03151321411133, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8583967685699463, "num_tokens": 702455508.0, "step": 18409 }, { "epoch": 2.341941228851291, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.292545318603516, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8741756081581116, "num_tokens": 702493365.0, "step": 18410 }, { "epoch": 2.3420684391298816, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.400455474853516, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8637415170669556, "num_tokens": 702528036.0, "step": 18411 }, { "epoch": 2.342195649408472, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.03807830810547, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8572736978530884, "num_tokens": 702571238.0, "step": 18412 }, { "epoch": 2.3423228596870627, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.46558380126953, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8567848205566406, "num_tokens": 702611247.0, "step": 18413 }, { "epoch": 2.342450069965653, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.74953079223633, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8631489276885986, "num_tokens": 702649661.0, "step": 18414 }, { "epoch": 2.3425772802442437, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.32547378540039, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8537112474441528, "num_tokens": 702693573.0, "step": 18415 }, { "epoch": 2.3427044905228342, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.37527084350586, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.871498703956604, "num_tokens": 702735351.0, "step": 18416 }, { "epoch": 2.3428317008014248, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.8838005065918, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8661439418792725, "num_tokens": 702773406.0, "step": 18417 }, { "epoch": 2.3429589110800153, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.609188079833984, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8688876032829285, "num_tokens": 702809939.0, "step": 18418 }, { "epoch": 2.343086121358606, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.44292449951172, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8602985143661499, "num_tokens": 702851120.0, "step": 18419 }, { "epoch": 2.3432133316371964, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.73563003540039, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8708599805831909, "num_tokens": 702889492.0, "step": 18420 }, { "epoch": 2.343340541915787, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.39004898071289, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.856303334236145, "num_tokens": 702930486.0, "step": 18421 }, { "epoch": 2.3434677521943774, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.4451904296875, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8542706966400146, "num_tokens": 702969866.0, "step": 18422 }, { "epoch": 2.343594962472968, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.286720275878906, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8528059720993042, "num_tokens": 703013703.0, "step": 18423 }, { "epoch": 2.3437221727515585, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 47.91254425048828, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8515409827232361, "num_tokens": 703054849.0, "step": 18424 }, { "epoch": 2.343849383030149, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.36929702758789, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8554843068122864, "num_tokens": 703088250.0, "step": 18425 }, { "epoch": 2.3439765933087395, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.10799026489258, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8567609190940857, "num_tokens": 703124072.0, "step": 18426 }, { "epoch": 2.3441038035873296, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.64939498901367, "learning_rate": 1e-06, "loss": 0.6596, "mean_token_accuracy": 0.8445229530334473, "num_tokens": 703158869.0, "step": 18427 }, { "epoch": 2.3442310138659206, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.38753890991211, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8633313775062561, "num_tokens": 703193342.0, "step": 18428 }, { "epoch": 2.3443582241445107, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.68519592285156, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.8735515475273132, "num_tokens": 703229899.0, "step": 18429 }, { "epoch": 2.344485434423101, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.665557861328125, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8606857061386108, "num_tokens": 703266735.0, "step": 18430 }, { "epoch": 2.3446126447016917, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.72712707519531, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8567116260528564, "num_tokens": 703307515.0, "step": 18431 }, { "epoch": 2.3447398549802823, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.85795211791992, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8718810081481934, "num_tokens": 703341237.0, "step": 18432 }, { "epoch": 2.344867065258873, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.47053146362305, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8729057312011719, "num_tokens": 703378900.0, "step": 18433 }, { "epoch": 2.3449942755374633, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.80879211425781, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8701747059822083, "num_tokens": 703415260.0, "step": 18434 }, { "epoch": 2.345121485816054, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.31889724731445, "learning_rate": 1e-06, "loss": 0.5861, "mean_token_accuracy": 0.8684256076812744, "num_tokens": 703452399.0, "step": 18435 }, { "epoch": 2.3452486960946444, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.1063232421875, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8636528253555298, "num_tokens": 703492700.0, "step": 18436 }, { "epoch": 2.345375906373235, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.392879486083984, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8743808269500732, "num_tokens": 703530805.0, "step": 18437 }, { "epoch": 2.3455031166518254, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.339263916015625, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8703778982162476, "num_tokens": 703571894.0, "step": 18438 }, { "epoch": 2.345630326930416, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.57745361328125, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8671152591705322, "num_tokens": 703612583.0, "step": 18439 }, { "epoch": 2.3457575372090065, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.90998077392578, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8541649580001831, "num_tokens": 703649100.0, "step": 18440 }, { "epoch": 2.345884747487597, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.62581253051758, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8765655755996704, "num_tokens": 703680425.0, "step": 18441 }, { "epoch": 2.3460119577661875, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.46004104614258, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8682063817977905, "num_tokens": 703721819.0, "step": 18442 }, { "epoch": 2.346139168044778, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.95210647583008, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8578845858573914, "num_tokens": 703755432.0, "step": 18443 }, { "epoch": 2.3462663783233686, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.49445724487305, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8613036274909973, "num_tokens": 703795903.0, "step": 18444 }, { "epoch": 2.346393588601959, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.79332733154297, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8737032413482666, "num_tokens": 703836074.0, "step": 18445 }, { "epoch": 2.3465207988805497, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.113189697265625, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8677300810813904, "num_tokens": 703870609.0, "step": 18446 }, { "epoch": 2.34664800915914, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.5633430480957, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8669863939285278, "num_tokens": 703909034.0, "step": 18447 }, { "epoch": 2.3467752194377307, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.21573257446289, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8551307320594788, "num_tokens": 703946976.0, "step": 18448 }, { "epoch": 2.3469024297163212, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.95946502685547, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8639673590660095, "num_tokens": 703986379.0, "step": 18449 }, { "epoch": 2.3470296399949118, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.3797721862793, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8798257112503052, "num_tokens": 704019461.0, "step": 18450 }, { "epoch": 2.3471568502735023, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.14259338378906, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8507768511772156, "num_tokens": 704053411.0, "step": 18451 }, { "epoch": 2.3472840605520924, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.11603546142578, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8672478795051575, "num_tokens": 704092633.0, "step": 18452 }, { "epoch": 2.3474112708306833, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.03500747680664, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8665143251419067, "num_tokens": 704126955.0, "step": 18453 }, { "epoch": 2.3475384811092734, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.15536880493164, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8782777786254883, "num_tokens": 704162222.0, "step": 18454 }, { "epoch": 2.347665691387864, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.140220642089844, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8617212772369385, "num_tokens": 704198949.0, "step": 18455 }, { "epoch": 2.3477929016664545, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.393619537353516, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8574272394180298, "num_tokens": 704235327.0, "step": 18456 }, { "epoch": 2.347920111945045, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.107086181640625, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8678397536277771, "num_tokens": 704272783.0, "step": 18457 }, { "epoch": 2.3480473222236355, "ewc_loss": 0.1875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000164031982421875, "grad_norm": 48.11601257324219, "learning_rate": 1e-06, "loss": 0.6555, "mean_token_accuracy": 0.8447578549385071, "num_tokens": 704314396.0, "step": 18458 }, { "epoch": 2.348174532502226, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.87131881713867, "learning_rate": 1e-06, "loss": 0.5473, "mean_token_accuracy": 0.8775364756584167, "num_tokens": 704346903.0, "step": 18459 }, { "epoch": 2.3483017427808166, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.285926818847656, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.870363712310791, "num_tokens": 704380456.0, "step": 18460 }, { "epoch": 2.348428953059407, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.268070220947266, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8735485076904297, "num_tokens": 704416403.0, "step": 18461 }, { "epoch": 2.3485561633379977, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.24967956542969, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8635185360908508, "num_tokens": 704460431.0, "step": 18462 }, { "epoch": 2.348683373616588, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.15060043334961, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8706244826316833, "num_tokens": 704497872.0, "step": 18463 }, { "epoch": 2.3488105838951787, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.229637145996094, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8646028637886047, "num_tokens": 704538180.0, "step": 18464 }, { "epoch": 2.3489377941737692, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.01466751098633, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.8757542371749878, "num_tokens": 704578342.0, "step": 18465 }, { "epoch": 2.3490650044523598, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.10031509399414, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8731949329376221, "num_tokens": 704611792.0, "step": 18466 }, { "epoch": 2.3491922147309503, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.74983596801758, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8655728101730347, "num_tokens": 704649898.0, "step": 18467 }, { "epoch": 2.349319425009541, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.329010009765625, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8715935349464417, "num_tokens": 704690632.0, "step": 18468 }, { "epoch": 2.3494466352881314, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.73181915283203, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8525468707084656, "num_tokens": 704722955.0, "step": 18469 }, { "epoch": 2.349573845566722, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.682308197021484, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8625730276107788, "num_tokens": 704762678.0, "step": 18470 }, { "epoch": 2.3497010558453124, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.381568908691406, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8886323571205139, "num_tokens": 704805440.0, "step": 18471 }, { "epoch": 2.349828266123903, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.63822555541992, "learning_rate": 1e-06, "loss": 0.6599, "mean_token_accuracy": 0.8452264666557312, "num_tokens": 704848153.0, "step": 18472 }, { "epoch": 2.3499554764024935, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.63186264038086, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8599849939346313, "num_tokens": 704886739.0, "step": 18473 }, { "epoch": 2.350082686681084, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.52837371826172, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.862663745880127, "num_tokens": 704926145.0, "step": 18474 }, { "epoch": 2.3502098969596745, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.94921875, "learning_rate": 1e-06, "loss": 0.6391, "mean_token_accuracy": 0.8461320400238037, "num_tokens": 704964135.0, "step": 18475 }, { "epoch": 2.350337107238265, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.40654373168945, "learning_rate": 1e-06, "loss": 0.6454, "mean_token_accuracy": 0.8506996631622314, "num_tokens": 705002171.0, "step": 18476 }, { "epoch": 2.350464317516855, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.40487289428711, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8821948170661926, "num_tokens": 705040660.0, "step": 18477 }, { "epoch": 2.350591527795446, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.577091217041016, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8702776432037354, "num_tokens": 705073655.0, "step": 18478 }, { "epoch": 2.350718738074036, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.020450592041016, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.858810544013977, "num_tokens": 705115497.0, "step": 18479 }, { "epoch": 2.3508459483526267, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.83879470825195, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8801026344299316, "num_tokens": 705152642.0, "step": 18480 }, { "epoch": 2.3509731586312173, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.73640060424805, "learning_rate": 1e-06, "loss": 0.6533, "mean_token_accuracy": 0.8464037775993347, "num_tokens": 705187282.0, "step": 18481 }, { "epoch": 2.351100368909808, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.80510330200195, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8670346140861511, "num_tokens": 705225635.0, "step": 18482 }, { "epoch": 2.3512275791883983, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.04120635986328, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.8722295761108398, "num_tokens": 705260376.0, "step": 18483 }, { "epoch": 2.351354789466989, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.47111511230469, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8603463172912598, "num_tokens": 705298799.0, "step": 18484 }, { "epoch": 2.3514819997455794, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.66460037231445, "learning_rate": 1e-06, "loss": 0.6383, "mean_token_accuracy": 0.8444626331329346, "num_tokens": 705332570.0, "step": 18485 }, { "epoch": 2.35160921002417, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.723812103271484, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8640998601913452, "num_tokens": 705371214.0, "step": 18486 }, { "epoch": 2.3517364203027604, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.89046859741211, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8590733408927917, "num_tokens": 705405051.0, "step": 18487 }, { "epoch": 2.351863630581351, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.83603286743164, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8614555597305298, "num_tokens": 705442460.0, "step": 18488 }, { "epoch": 2.3519908408599415, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.66612243652344, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8621206283569336, "num_tokens": 705482929.0, "step": 18489 }, { "epoch": 2.352118051138532, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.24846649169922, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8567782640457153, "num_tokens": 705525313.0, "step": 18490 }, { "epoch": 2.3522452614171225, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.54843521118164, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8577905893325806, "num_tokens": 705563730.0, "step": 18491 }, { "epoch": 2.352372471695713, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.91033172607422, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8654550313949585, "num_tokens": 705602459.0, "step": 18492 }, { "epoch": 2.3524996819743036, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.18185806274414, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8595520853996277, "num_tokens": 705636223.0, "step": 18493 }, { "epoch": 2.352626892252894, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.98897933959961, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8598553538322449, "num_tokens": 705676178.0, "step": 18494 }, { "epoch": 2.3527541025314846, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.869380950927734, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8694665431976318, "num_tokens": 705716687.0, "step": 18495 }, { "epoch": 2.352881312810075, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.537967681884766, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8708838224411011, "num_tokens": 705759163.0, "step": 18496 }, { "epoch": 2.3530085230886657, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.172142028808594, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8643672466278076, "num_tokens": 705795982.0, "step": 18497 }, { "epoch": 2.3531357333672562, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.03493118286133, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.864750862121582, "num_tokens": 705835601.0, "step": 18498 }, { "epoch": 2.3532629436458468, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.46833801269531, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8616828322410583, "num_tokens": 705876858.0, "step": 18499 }, { "epoch": 2.353390153924437, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 47.97723388671875, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.853880763053894, "num_tokens": 705921944.0, "step": 18500 }, { "epoch": 2.353517364203028, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.381900787353516, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8633304834365845, "num_tokens": 705967999.0, "step": 18501 }, { "epoch": 2.353644574481618, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.174461364746094, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8555642366409302, "num_tokens": 706003370.0, "step": 18502 }, { "epoch": 2.353771784760209, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.580204010009766, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8511344790458679, "num_tokens": 706036551.0, "step": 18503 }, { "epoch": 2.353898995038799, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.58132553100586, "learning_rate": 1e-06, "loss": 0.5402, "mean_token_accuracy": 0.8788291215896606, "num_tokens": 706075965.0, "step": 18504 }, { "epoch": 2.3540262053173895, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.833580017089844, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8518015146255493, "num_tokens": 706117943.0, "step": 18505 }, { "epoch": 2.35415341559598, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.41675567626953, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.866824746131897, "num_tokens": 706157183.0, "step": 18506 }, { "epoch": 2.3542806258745705, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.26680374145508, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8701292276382446, "num_tokens": 706196878.0, "step": 18507 }, { "epoch": 2.354407836153161, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.53211975097656, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.8509001731872559, "num_tokens": 706231102.0, "step": 18508 }, { "epoch": 2.3545350464317516, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.36197280883789, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8707520961761475, "num_tokens": 706273372.0, "step": 18509 }, { "epoch": 2.354662256710342, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.61714553833008, "learning_rate": 1e-06, "loss": 0.6461, "mean_token_accuracy": 0.8565666675567627, "num_tokens": 706308106.0, "step": 18510 }, { "epoch": 2.3547894669889327, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.16679763793945, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8703188896179199, "num_tokens": 706344267.0, "step": 18511 }, { "epoch": 2.354916677267523, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.5605354309082, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8814363479614258, "num_tokens": 706385019.0, "step": 18512 }, { "epoch": 2.3550438875461137, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.93241882324219, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8611565232276917, "num_tokens": 706424913.0, "step": 18513 }, { "epoch": 2.3551710978247042, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.99912643432617, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.863683819770813, "num_tokens": 706466064.0, "step": 18514 }, { "epoch": 2.3552983081032948, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.6490478515625, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8677595257759094, "num_tokens": 706504572.0, "step": 18515 }, { "epoch": 2.3554255183818853, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.415565490722656, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8571396470069885, "num_tokens": 706543901.0, "step": 18516 }, { "epoch": 2.355552728660476, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.6099853515625, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8553630709648132, "num_tokens": 706579095.0, "step": 18517 }, { "epoch": 2.3556799389390664, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.11939239501953, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8541374206542969, "num_tokens": 706614078.0, "step": 18518 }, { "epoch": 2.355807149217657, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.57741928100586, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8693224191665649, "num_tokens": 706652596.0, "step": 18519 }, { "epoch": 2.3559343594962474, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.199989318847656, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8852489590644836, "num_tokens": 706691297.0, "step": 18520 }, { "epoch": 2.356061569774838, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.38877487182617, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8510088920593262, "num_tokens": 706727344.0, "step": 18521 }, { "epoch": 2.3561887800534285, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.3577880859375, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8672131299972534, "num_tokens": 706770411.0, "step": 18522 }, { "epoch": 2.356315990332019, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.050254821777344, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8663899898529053, "num_tokens": 706816215.0, "step": 18523 }, { "epoch": 2.3564432006106095, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.380897521972656, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8564566373825073, "num_tokens": 706857400.0, "step": 18524 }, { "epoch": 2.3565704108891996, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.129493713378906, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8537409901618958, "num_tokens": 706892715.0, "step": 18525 }, { "epoch": 2.3566976211677906, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.276893615722656, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8615312576293945, "num_tokens": 706934373.0, "step": 18526 }, { "epoch": 2.3568248314463807, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.63633728027344, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8637372851371765, "num_tokens": 706976885.0, "step": 18527 }, { "epoch": 2.356952041724971, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.17485809326172, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8693443536758423, "num_tokens": 707016957.0, "step": 18528 }, { "epoch": 2.3570792520035617, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.42518615722656, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8724894523620605, "num_tokens": 707053687.0, "step": 18529 }, { "epoch": 2.3572064622821522, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.27627182006836, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.856975257396698, "num_tokens": 707087327.0, "step": 18530 }, { "epoch": 2.3573336725607428, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 47.7397575378418, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8758681416511536, "num_tokens": 707124479.0, "step": 18531 }, { "epoch": 2.3574608828393333, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.99918746948242, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8670493960380554, "num_tokens": 707162452.0, "step": 18532 }, { "epoch": 2.357588093117924, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.074371337890625, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8727033138275146, "num_tokens": 707196765.0, "step": 18533 }, { "epoch": 2.3577153033965144, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.033382415771484, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8688449859619141, "num_tokens": 707230151.0, "step": 18534 }, { "epoch": 2.357842513675105, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.30928421020508, "learning_rate": 1e-06, "loss": 0.6502, "mean_token_accuracy": 0.8452056050300598, "num_tokens": 707264373.0, "step": 18535 }, { "epoch": 2.3579697239536954, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.95549011230469, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8581758141517639, "num_tokens": 707304844.0, "step": 18536 }, { "epoch": 2.358096934232286, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.726280212402344, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8646180629730225, "num_tokens": 707347081.0, "step": 18537 }, { "epoch": 2.3582241445108765, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.517173767089844, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8562746047973633, "num_tokens": 707388243.0, "step": 18538 }, { "epoch": 2.358351354789467, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.7702751159668, "learning_rate": 1e-06, "loss": 0.6308, "mean_token_accuracy": 0.8503616452217102, "num_tokens": 707427109.0, "step": 18539 }, { "epoch": 2.3584785650680575, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.540122985839844, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.865105152130127, "num_tokens": 707462502.0, "step": 18540 }, { "epoch": 2.358605775346648, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.1171989440918, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8563910722732544, "num_tokens": 707502861.0, "step": 18541 }, { "epoch": 2.3587329856252386, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.059391021728516, "learning_rate": 1e-06, "loss": 0.6651, "mean_token_accuracy": 0.8422768115997314, "num_tokens": 707535868.0, "step": 18542 }, { "epoch": 2.358860195903829, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.111053466796875, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8653160333633423, "num_tokens": 707566128.0, "step": 18543 }, { "epoch": 2.3589874061824196, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.20198059082031, "learning_rate": 1e-06, "loss": 0.5698, "mean_token_accuracy": 0.8694288730621338, "num_tokens": 707605113.0, "step": 18544 }, { "epoch": 2.35911461646101, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.83507537841797, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8598097562789917, "num_tokens": 707647968.0, "step": 18545 }, { "epoch": 2.3592418267396007, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.94155502319336, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.870991587638855, "num_tokens": 707682183.0, "step": 18546 }, { "epoch": 2.3593690370181912, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.114444732666016, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8717908263206482, "num_tokens": 707720938.0, "step": 18547 }, { "epoch": 2.3594962472967818, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.86932373046875, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8520327806472778, "num_tokens": 707762624.0, "step": 18548 }, { "epoch": 2.3596234575753723, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.0704460144043, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.853575587272644, "num_tokens": 707798406.0, "step": 18549 }, { "epoch": 2.3597506678539624, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.433101654052734, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8629053831100464, "num_tokens": 707835935.0, "step": 18550 }, { "epoch": 2.3598778781325533, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.6652717590332, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8735266327857971, "num_tokens": 707872971.0, "step": 18551 }, { "epoch": 2.3600050884111434, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.47711181640625, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8730943202972412, "num_tokens": 707909798.0, "step": 18552 }, { "epoch": 2.360132298689734, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.780029296875, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.866169273853302, "num_tokens": 707951130.0, "step": 18553 }, { "epoch": 2.3602595089683245, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.40048599243164, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8503046631813049, "num_tokens": 707989502.0, "step": 18554 }, { "epoch": 2.360386719246915, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.81996536254883, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8617976903915405, "num_tokens": 708023521.0, "step": 18555 }, { "epoch": 2.3605139295255055, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.85690689086914, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8688852190971375, "num_tokens": 708057594.0, "step": 18556 }, { "epoch": 2.360641139804096, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.77529525756836, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8601326942443848, "num_tokens": 708101393.0, "step": 18557 }, { "epoch": 2.3607683500826866, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.47362518310547, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8739893436431885, "num_tokens": 708137141.0, "step": 18558 }, { "epoch": 2.360895560361277, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.79375076293945, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8665528297424316, "num_tokens": 708173497.0, "step": 18559 }, { "epoch": 2.3610227706398677, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.59203338623047, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8639810085296631, "num_tokens": 708209412.0, "step": 18560 }, { "epoch": 2.361149980918458, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.4092903137207, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8713339567184448, "num_tokens": 708239675.0, "step": 18561 }, { "epoch": 2.3612771911970487, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.71139144897461, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.856857180595398, "num_tokens": 708274918.0, "step": 18562 }, { "epoch": 2.3614044014756392, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.42540740966797, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8484887480735779, "num_tokens": 708313645.0, "step": 18563 }, { "epoch": 2.3615316117542298, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.80082321166992, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8645588755607605, "num_tokens": 708357985.0, "step": 18564 }, { "epoch": 2.3616588220328203, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.76893997192383, "learning_rate": 1e-06, "loss": 0.6732, "mean_token_accuracy": 0.8411233425140381, "num_tokens": 708393606.0, "step": 18565 }, { "epoch": 2.361786032311411, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.032615661621094, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.869554877281189, "num_tokens": 708428941.0, "step": 18566 }, { "epoch": 2.3619132425900013, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.946502685546875, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8676512837409973, "num_tokens": 708467847.0, "step": 18567 }, { "epoch": 2.362040452868592, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.903465270996094, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8801835179328918, "num_tokens": 708503212.0, "step": 18568 }, { "epoch": 2.3621676631471824, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.15142059326172, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8607327342033386, "num_tokens": 708542897.0, "step": 18569 }, { "epoch": 2.362294873425773, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.102054595947266, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.85069739818573, "num_tokens": 708574725.0, "step": 18570 }, { "epoch": 2.3624220837043635, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.551513671875, "learning_rate": 1e-06, "loss": 0.5525, "mean_token_accuracy": 0.8743553161621094, "num_tokens": 708613557.0, "step": 18571 }, { "epoch": 2.362549293982954, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.50224304199219, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8567123413085938, "num_tokens": 708651872.0, "step": 18572 }, { "epoch": 2.3626765042615445, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.83379364013672, "learning_rate": 1e-06, "loss": 0.6327, "mean_token_accuracy": 0.8497481942176819, "num_tokens": 708688654.0, "step": 18573 }, { "epoch": 2.362803714540135, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.59612274169922, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.868162989616394, "num_tokens": 708723285.0, "step": 18574 }, { "epoch": 2.362930924818725, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.346046447753906, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.8721194267272949, "num_tokens": 708760617.0, "step": 18575 }, { "epoch": 2.363058135097316, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.06233215332031, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8683000802993774, "num_tokens": 708790792.0, "step": 18576 }, { "epoch": 2.363185345375906, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.24913024902344, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8534853458404541, "num_tokens": 708829935.0, "step": 18577 }, { "epoch": 2.3633125556544967, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.74338912963867, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8554563522338867, "num_tokens": 708870578.0, "step": 18578 }, { "epoch": 2.3634397659330872, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.593135833740234, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8619141578674316, "num_tokens": 708907044.0, "step": 18579 }, { "epoch": 2.3635669762116778, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.95283889770508, "learning_rate": 1e-06, "loss": 0.6388, "mean_token_accuracy": 0.8456363677978516, "num_tokens": 708947443.0, "step": 18580 }, { "epoch": 2.3636941864902683, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.58033752441406, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8653129935264587, "num_tokens": 708983174.0, "step": 18581 }, { "epoch": 2.363821396768859, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.11429214477539, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8663103580474854, "num_tokens": 709021950.0, "step": 18582 }, { "epoch": 2.3639486070474494, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.32440185546875, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8690847158432007, "num_tokens": 709057236.0, "step": 18583 }, { "epoch": 2.36407581732604, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.3331298828125, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8648151159286499, "num_tokens": 709093376.0, "step": 18584 }, { "epoch": 2.3642030276046304, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.11628341674805, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8571085929870605, "num_tokens": 709129502.0, "step": 18585 }, { "epoch": 2.364330237883221, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.126869201660156, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8609920740127563, "num_tokens": 709162538.0, "step": 18586 }, { "epoch": 2.3644574481618115, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.89940643310547, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8507611751556396, "num_tokens": 709201690.0, "step": 18587 }, { "epoch": 2.364584658440402, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.65135955810547, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8561865091323853, "num_tokens": 709240593.0, "step": 18588 }, { "epoch": 2.3647118687189925, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.308929443359375, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8732633590698242, "num_tokens": 709278192.0, "step": 18589 }, { "epoch": 2.364839078997583, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.71332550048828, "learning_rate": 1e-06, "loss": 0.531, "mean_token_accuracy": 0.8808525800704956, "num_tokens": 709311797.0, "step": 18590 }, { "epoch": 2.3649662892761736, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.623130798339844, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8613038063049316, "num_tokens": 709350379.0, "step": 18591 }, { "epoch": 2.365093499554764, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.42579650878906, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8893029093742371, "num_tokens": 709389030.0, "step": 18592 }, { "epoch": 2.3652207098333546, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.9217529296875, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8561989665031433, "num_tokens": 709433692.0, "step": 18593 }, { "epoch": 2.365347920111945, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.28743362426758, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8619523048400879, "num_tokens": 709470773.0, "step": 18594 }, { "epoch": 2.3654751303905357, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.13677215576172, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8577052354812622, "num_tokens": 709506764.0, "step": 18595 }, { "epoch": 2.3656023406691262, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.34556198120117, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.859308123588562, "num_tokens": 709546931.0, "step": 18596 }, { "epoch": 2.3657295509477168, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.35093688964844, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8639736175537109, "num_tokens": 709582842.0, "step": 18597 }, { "epoch": 2.365856761226307, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.15046691894531, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8634399175643921, "num_tokens": 709621574.0, "step": 18598 }, { "epoch": 2.365983971504898, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.171104431152344, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8804214596748352, "num_tokens": 709662437.0, "step": 18599 }, { "epoch": 2.366111181783488, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.82539367675781, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.876996636390686, "num_tokens": 709701318.0, "step": 18600 }, { "epoch": 2.3662383920620784, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.15255355834961, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8638733625411987, "num_tokens": 709745156.0, "step": 18601 }, { "epoch": 2.366365602340669, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.61803436279297, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.856339693069458, "num_tokens": 709785849.0, "step": 18602 }, { "epoch": 2.3664928126192595, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.32569885253906, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8683760166168213, "num_tokens": 709830038.0, "step": 18603 }, { "epoch": 2.36662002289785, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.675052642822266, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.859481155872345, "num_tokens": 709871174.0, "step": 18604 }, { "epoch": 2.3667472331764405, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.575321197509766, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8843663334846497, "num_tokens": 709905921.0, "step": 18605 }, { "epoch": 2.366874443455031, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.60405731201172, "learning_rate": 1e-06, "loss": 0.5489, "mean_token_accuracy": 0.8782762289047241, "num_tokens": 709939856.0, "step": 18606 }, { "epoch": 2.3670016537336216, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.58895492553711, "learning_rate": 1e-06, "loss": 0.5472, "mean_token_accuracy": 0.8762380480766296, "num_tokens": 709978805.0, "step": 18607 }, { "epoch": 2.367128864012212, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.34529495239258, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8769342303276062, "num_tokens": 710015015.0, "step": 18608 }, { "epoch": 2.3672560742908026, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.9100456237793, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8553193807601929, "num_tokens": 710054887.0, "step": 18609 }, { "epoch": 2.367383284569393, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.97285461425781, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.863663911819458, "num_tokens": 710097012.0, "step": 18610 }, { "epoch": 2.3675104948479837, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.410789489746094, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8477516770362854, "num_tokens": 710130912.0, "step": 18611 }, { "epoch": 2.3676377051265742, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.64854049682617, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.860171377658844, "num_tokens": 710168880.0, "step": 18612 }, { "epoch": 2.3677649154051648, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.329254150390625, "learning_rate": 1e-06, "loss": 0.5737, "mean_token_accuracy": 0.8731234669685364, "num_tokens": 710205619.0, "step": 18613 }, { "epoch": 2.3678921256837553, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.44860076904297, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8698666095733643, "num_tokens": 710239475.0, "step": 18614 }, { "epoch": 2.368019335962346, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.237464904785156, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.866866946220398, "num_tokens": 710276533.0, "step": 18615 }, { "epoch": 2.3681465462409363, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.3776969909668, "learning_rate": 1e-06, "loss": 0.5436, "mean_token_accuracy": 0.8787937164306641, "num_tokens": 710316731.0, "step": 18616 }, { "epoch": 2.368273756519527, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.192955017089844, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.859165370464325, "num_tokens": 710352310.0, "step": 18617 }, { "epoch": 2.3684009667981174, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.89115524291992, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8589222431182861, "num_tokens": 710400203.0, "step": 18618 }, { "epoch": 2.368528177076708, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.58970642089844, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8694930672645569, "num_tokens": 710433054.0, "step": 18619 }, { "epoch": 2.3686553873552985, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.35171127319336, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8669708967208862, "num_tokens": 710466879.0, "step": 18620 }, { "epoch": 2.368782597633889, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.514625549316406, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8556762933731079, "num_tokens": 710502808.0, "step": 18621 }, { "epoch": 2.3689098079124795, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.64335632324219, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8520641326904297, "num_tokens": 710546190.0, "step": 18622 }, { "epoch": 2.3690370181910696, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.643211364746094, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8643871545791626, "num_tokens": 710585556.0, "step": 18623 }, { "epoch": 2.3691642284696606, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.68921661376953, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.866595983505249, "num_tokens": 710621366.0, "step": 18624 }, { "epoch": 2.3692914387482507, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.70402145385742, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8684815764427185, "num_tokens": 710660217.0, "step": 18625 }, { "epoch": 2.369418649026841, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.694644927978516, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8591938614845276, "num_tokens": 710695798.0, "step": 18626 }, { "epoch": 2.3695458593054317, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.23166275024414, "learning_rate": 1e-06, "loss": 0.658, "mean_token_accuracy": 0.846531867980957, "num_tokens": 710732739.0, "step": 18627 }, { "epoch": 2.3696730695840222, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.619136810302734, "learning_rate": 1e-06, "loss": 0.6297, "mean_token_accuracy": 0.8544526100158691, "num_tokens": 710769651.0, "step": 18628 }, { "epoch": 2.3698002798626128, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.227054595947266, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.861872673034668, "num_tokens": 710809487.0, "step": 18629 }, { "epoch": 2.3699274901412033, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.83111572265625, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8674107789993286, "num_tokens": 710849152.0, "step": 18630 }, { "epoch": 2.370054700419794, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.38566207885742, "learning_rate": 1e-06, "loss": 0.6389, "mean_token_accuracy": 0.8484872579574585, "num_tokens": 710890933.0, "step": 18631 }, { "epoch": 2.3701819106983844, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.96714401245117, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8667892217636108, "num_tokens": 710928262.0, "step": 18632 }, { "epoch": 2.370309120976975, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.43849563598633, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8688175678253174, "num_tokens": 710962969.0, "step": 18633 }, { "epoch": 2.3704363312555654, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.57036209106445, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8596103191375732, "num_tokens": 710998717.0, "step": 18634 }, { "epoch": 2.370563541534156, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.37590026855469, "learning_rate": 1e-06, "loss": 0.6436, "mean_token_accuracy": 0.8477535247802734, "num_tokens": 711032819.0, "step": 18635 }, { "epoch": 2.3706907518127465, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.69126510620117, "learning_rate": 1e-06, "loss": 0.6264, "mean_token_accuracy": 0.8544284105300903, "num_tokens": 711069395.0, "step": 18636 }, { "epoch": 2.370817962091337, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.43690490722656, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8644485473632812, "num_tokens": 711105754.0, "step": 18637 }, { "epoch": 2.3709451723699275, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.72980499267578, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8582048416137695, "num_tokens": 711145291.0, "step": 18638 }, { "epoch": 2.371072382648518, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.78445816040039, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8638778924942017, "num_tokens": 711183431.0, "step": 18639 }, { "epoch": 2.3711995929271086, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.716217041015625, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8491288423538208, "num_tokens": 711229754.0, "step": 18640 }, { "epoch": 2.371326803205699, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.03269577026367, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.86442631483078, "num_tokens": 711269071.0, "step": 18641 }, { "epoch": 2.3714540134842896, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.5344352722168, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8606134653091431, "num_tokens": 711312036.0, "step": 18642 }, { "epoch": 2.37158122376288, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.0611457824707, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8715310096740723, "num_tokens": 711347945.0, "step": 18643 }, { "epoch": 2.3717084340414707, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.80208969116211, "learning_rate": 1e-06, "loss": 0.6697, "mean_token_accuracy": 0.8408297896385193, "num_tokens": 711388433.0, "step": 18644 }, { "epoch": 2.371835644320061, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.125709533691406, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8677864074707031, "num_tokens": 711425821.0, "step": 18645 }, { "epoch": 2.3719628545986517, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.44144058227539, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8574367165565491, "num_tokens": 711462725.0, "step": 18646 }, { "epoch": 2.3720900648772423, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.36113357543945, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.878203272819519, "num_tokens": 711499987.0, "step": 18647 }, { "epoch": 2.3722172751558324, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.836490631103516, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.846304714679718, "num_tokens": 711543658.0, "step": 18648 }, { "epoch": 2.3723444854344233, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.15943908691406, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8756484389305115, "num_tokens": 711583875.0, "step": 18649 }, { "epoch": 2.3724716957130134, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.66485595703125, "learning_rate": 1e-06, "loss": 0.6464, "mean_token_accuracy": 0.8452693819999695, "num_tokens": 711625543.0, "step": 18650 }, { "epoch": 2.372598905991604, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.43238067626953, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8646805286407471, "num_tokens": 711660528.0, "step": 18651 }, { "epoch": 2.3727261162701945, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.14778137207031, "learning_rate": 1e-06, "loss": 0.55, "mean_token_accuracy": 0.8776983618736267, "num_tokens": 711695646.0, "step": 18652 }, { "epoch": 2.372853326548785, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.046302795410156, "learning_rate": 1e-06, "loss": 0.6512, "mean_token_accuracy": 0.8468188047409058, "num_tokens": 711734691.0, "step": 18653 }, { "epoch": 2.3729805368273755, "ewc_loss": 0.1884765625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016498565673828125, "grad_norm": 48.050743103027344, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8661767244338989, "num_tokens": 711771518.0, "step": 18654 }, { "epoch": 2.373107747105966, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.74864196777344, "learning_rate": 1e-06, "loss": 0.6427, "mean_token_accuracy": 0.8526309728622437, "num_tokens": 711808968.0, "step": 18655 }, { "epoch": 2.3732349573845566, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.466156005859375, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.870608925819397, "num_tokens": 711842944.0, "step": 18656 }, { "epoch": 2.373362167663147, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.457069396972656, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8595244884490967, "num_tokens": 711882740.0, "step": 18657 }, { "epoch": 2.3734893779417376, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.373512268066406, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8577439785003662, "num_tokens": 711925362.0, "step": 18658 }, { "epoch": 2.373616588220328, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.31376647949219, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8677628040313721, "num_tokens": 711971285.0, "step": 18659 }, { "epoch": 2.3737437984989187, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.62202072143555, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8586016893386841, "num_tokens": 712013306.0, "step": 18660 }, { "epoch": 2.3738710087775092, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.76134490966797, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8639888167381287, "num_tokens": 712050837.0, "step": 18661 }, { "epoch": 2.3739982190560998, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.228965759277344, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8790039420127869, "num_tokens": 712085140.0, "step": 18662 }, { "epoch": 2.3741254293346903, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.80509567260742, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8772685527801514, "num_tokens": 712122495.0, "step": 18663 }, { "epoch": 2.374252639613281, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.12055587768555, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8574137091636658, "num_tokens": 712161312.0, "step": 18664 }, { "epoch": 2.3743798498918713, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.83675765991211, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8613966107368469, "num_tokens": 712197275.0, "step": 18665 }, { "epoch": 2.374507060170462, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.693485260009766, "learning_rate": 1e-06, "loss": 0.6355, "mean_token_accuracy": 0.8503402471542358, "num_tokens": 712231309.0, "step": 18666 }, { "epoch": 2.3746342704490524, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.99749755859375, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8583507537841797, "num_tokens": 712264864.0, "step": 18667 }, { "epoch": 2.374761480727643, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.9618034362793, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8641911745071411, "num_tokens": 712303564.0, "step": 18668 }, { "epoch": 2.3748886910062335, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.81998062133789, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8729682564735413, "num_tokens": 712341872.0, "step": 18669 }, { "epoch": 2.375015901284824, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.18116760253906, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8512222766876221, "num_tokens": 712379835.0, "step": 18670 }, { "epoch": 2.3751431115634145, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.01437759399414, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8594168424606323, "num_tokens": 712417462.0, "step": 18671 }, { "epoch": 2.375270321842005, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.6807746887207, "learning_rate": 1e-06, "loss": 0.6944, "mean_token_accuracy": 0.8359282612800598, "num_tokens": 712461581.0, "step": 18672 }, { "epoch": 2.375397532120595, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.72688293457031, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8620906472206116, "num_tokens": 712499615.0, "step": 18673 }, { "epoch": 2.375524742399186, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.750389099121094, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8735678195953369, "num_tokens": 712533264.0, "step": 18674 }, { "epoch": 2.375651952677776, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.74668884277344, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8552427291870117, "num_tokens": 712571496.0, "step": 18675 }, { "epoch": 2.3757791629563667, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.43730926513672, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8699593544006348, "num_tokens": 712606510.0, "step": 18676 }, { "epoch": 2.3759063732349572, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.41907501220703, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8716617822647095, "num_tokens": 712646218.0, "step": 18677 }, { "epoch": 2.3760335835135478, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.22040557861328, "learning_rate": 1e-06, "loss": 0.7054, "mean_token_accuracy": 0.8300334215164185, "num_tokens": 712681905.0, "step": 18678 }, { "epoch": 2.3761607937921383, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.393959045410156, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8754543662071228, "num_tokens": 712719916.0, "step": 18679 }, { "epoch": 2.376288004070729, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.87314224243164, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8640820980072021, "num_tokens": 712760849.0, "step": 18680 }, { "epoch": 2.3764152143493193, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.49831771850586, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8739252090454102, "num_tokens": 712801199.0, "step": 18681 }, { "epoch": 2.37654242462791, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.49593734741211, "learning_rate": 1e-06, "loss": 0.6194, "mean_token_accuracy": 0.8565273880958557, "num_tokens": 712843471.0, "step": 18682 }, { "epoch": 2.3766696349065004, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.466575622558594, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8638826012611389, "num_tokens": 712880649.0, "step": 18683 }, { "epoch": 2.376796845185091, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.2233772277832, "learning_rate": 1e-06, "loss": 0.6684, "mean_token_accuracy": 0.8396031856536865, "num_tokens": 712920574.0, "step": 18684 }, { "epoch": 2.3769240554636815, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.48884963989258, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8732105493545532, "num_tokens": 712962544.0, "step": 18685 }, { "epoch": 2.377051265742272, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.162105560302734, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8655329346656799, "num_tokens": 712999138.0, "step": 18686 }, { "epoch": 2.3771784760208625, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.622650146484375, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8635247349739075, "num_tokens": 713041799.0, "step": 18687 }, { "epoch": 2.377305686299453, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.33847427368164, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.856287956237793, "num_tokens": 713088248.0, "step": 18688 }, { "epoch": 2.3774328965780436, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.66289138793945, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8711777925491333, "num_tokens": 713127239.0, "step": 18689 }, { "epoch": 2.377560106856634, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.47074890136719, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8673163056373596, "num_tokens": 713168389.0, "step": 18690 }, { "epoch": 2.3776873171352246, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.10802459716797, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8560680747032166, "num_tokens": 713208137.0, "step": 18691 }, { "epoch": 2.377814527413815, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.16438674926758, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8536252975463867, "num_tokens": 713249344.0, "step": 18692 }, { "epoch": 2.3779417376924057, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.54386901855469, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8591905236244202, "num_tokens": 713291376.0, "step": 18693 }, { "epoch": 2.378068947970996, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.36017990112305, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8613402247428894, "num_tokens": 713329925.0, "step": 18694 }, { "epoch": 2.3781961582495867, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.53697204589844, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.865392804145813, "num_tokens": 713372509.0, "step": 18695 }, { "epoch": 2.378323368528177, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.23910903930664, "learning_rate": 1e-06, "loss": 0.6145, "mean_token_accuracy": 0.8563673496246338, "num_tokens": 713410055.0, "step": 18696 }, { "epoch": 2.378450578806768, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.50212860107422, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8603808879852295, "num_tokens": 713453958.0, "step": 18697 }, { "epoch": 2.378577789085358, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.353126525878906, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.861481249332428, "num_tokens": 713485683.0, "step": 18698 }, { "epoch": 2.3787049993639484, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.43430709838867, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8628928661346436, "num_tokens": 713528565.0, "step": 18699 }, { "epoch": 2.378832209642539, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.48133087158203, "learning_rate": 1e-06, "loss": 0.6253, "mean_token_accuracy": 0.858356237411499, "num_tokens": 713560932.0, "step": 18700 }, { "epoch": 2.3789594199211295, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.31364822387695, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8575504422187805, "num_tokens": 713592891.0, "step": 18701 }, { "epoch": 2.37908663019972, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.26425552368164, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8596226572990417, "num_tokens": 713631285.0, "step": 18702 }, { "epoch": 2.3792138404783105, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.26464080810547, "learning_rate": 1e-06, "loss": 0.6641, "mean_token_accuracy": 0.8451681137084961, "num_tokens": 713675143.0, "step": 18703 }, { "epoch": 2.379341050756901, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.19757843017578, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.868001401424408, "num_tokens": 713717556.0, "step": 18704 }, { "epoch": 2.3794682610354916, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.2392463684082, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8543677926063538, "num_tokens": 713763515.0, "step": 18705 }, { "epoch": 2.379595471314082, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.06487274169922, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8675670027732849, "num_tokens": 713801064.0, "step": 18706 }, { "epoch": 2.3797226815926726, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.650367736816406, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8584601879119873, "num_tokens": 713841662.0, "step": 18707 }, { "epoch": 2.379849891871263, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.51082992553711, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8661409020423889, "num_tokens": 713883646.0, "step": 18708 }, { "epoch": 2.3799771021498537, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.862918853759766, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8832988739013672, "num_tokens": 713925365.0, "step": 18709 }, { "epoch": 2.3801043124284442, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.509212493896484, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8597846031188965, "num_tokens": 713961142.0, "step": 18710 }, { "epoch": 2.3802315227070348, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.96665573120117, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8699190616607666, "num_tokens": 714001652.0, "step": 18711 }, { "epoch": 2.3803587329856253, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.70500183105469, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8652697205543518, "num_tokens": 714043023.0, "step": 18712 }, { "epoch": 2.380485943264216, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.53961944580078, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8618508577346802, "num_tokens": 714080389.0, "step": 18713 }, { "epoch": 2.3806131535428063, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.21900177001953, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8666520118713379, "num_tokens": 714117695.0, "step": 18714 }, { "epoch": 2.380740363821397, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.374603271484375, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8701863288879395, "num_tokens": 714154204.0, "step": 18715 }, { "epoch": 2.3808675740999874, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.37586212158203, "learning_rate": 1e-06, "loss": 0.6606, "mean_token_accuracy": 0.8458271622657776, "num_tokens": 714194857.0, "step": 18716 }, { "epoch": 2.380994784378578, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.669673919677734, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8736758232116699, "num_tokens": 714228034.0, "step": 18717 }, { "epoch": 2.3811219946571685, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.211997985839844, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8679080605506897, "num_tokens": 714269688.0, "step": 18718 }, { "epoch": 2.381249204935759, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.680179595947266, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.8576076626777649, "num_tokens": 714309039.0, "step": 18719 }, { "epoch": 2.3813764152143495, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.134864807128906, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8678555488586426, "num_tokens": 714347453.0, "step": 18720 }, { "epoch": 2.3815036254929396, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.46736526489258, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8601320385932922, "num_tokens": 714385850.0, "step": 18721 }, { "epoch": 2.3816308357715306, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.30348205566406, "learning_rate": 1e-06, "loss": 0.6246, "mean_token_accuracy": 0.8582525849342346, "num_tokens": 714426735.0, "step": 18722 }, { "epoch": 2.3817580460501206, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.58999252319336, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8742448091506958, "num_tokens": 714465279.0, "step": 18723 }, { "epoch": 2.381885256328711, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.7537727355957, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.851211667060852, "num_tokens": 714505703.0, "step": 18724 }, { "epoch": 2.3820124666073017, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.777854919433594, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8555516600608826, "num_tokens": 714543642.0, "step": 18725 }, { "epoch": 2.3821396768858922, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.675567626953125, "learning_rate": 1e-06, "loss": 0.5496, "mean_token_accuracy": 0.8791719675064087, "num_tokens": 714583533.0, "step": 18726 }, { "epoch": 2.3822668871644828, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.9087028503418, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8675903081893921, "num_tokens": 714625577.0, "step": 18727 }, { "epoch": 2.3823940974430733, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.74490737915039, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8578222990036011, "num_tokens": 714664987.0, "step": 18728 }, { "epoch": 2.382521307721664, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.965267181396484, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8589543104171753, "num_tokens": 714705881.0, "step": 18729 }, { "epoch": 2.3826485180002543, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.42097091674805, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8534071445465088, "num_tokens": 714746012.0, "step": 18730 }, { "epoch": 2.382775728278845, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.607261657714844, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8641631007194519, "num_tokens": 714790001.0, "step": 18731 }, { "epoch": 2.3829029385574354, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.978370666503906, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8711216449737549, "num_tokens": 714824515.0, "step": 18732 }, { "epoch": 2.383030148836026, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.52547073364258, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8550366759300232, "num_tokens": 714860841.0, "step": 18733 }, { "epoch": 2.3831573591146165, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.728050231933594, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8662045001983643, "num_tokens": 714896373.0, "step": 18734 }, { "epoch": 2.383284569393207, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.8240852355957, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8652778267860413, "num_tokens": 714926684.0, "step": 18735 }, { "epoch": 2.3834117796717975, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.84296798706055, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8782711029052734, "num_tokens": 714961057.0, "step": 18736 }, { "epoch": 2.383538989950388, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.81059265136719, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8685243725776672, "num_tokens": 715001215.0, "step": 18737 }, { "epoch": 2.3836662002289786, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.167171478271484, "learning_rate": 1e-06, "loss": 0.6665, "mean_token_accuracy": 0.8409680128097534, "num_tokens": 715038252.0, "step": 18738 }, { "epoch": 2.383793410507569, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.46589660644531, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8625921607017517, "num_tokens": 715078320.0, "step": 18739 }, { "epoch": 2.3839206207861596, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.38059616088867, "learning_rate": 1e-06, "loss": 0.6343, "mean_token_accuracy": 0.8564177751541138, "num_tokens": 715115628.0, "step": 18740 }, { "epoch": 2.38404783106475, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.47053527832031, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8724228143692017, "num_tokens": 715153294.0, "step": 18741 }, { "epoch": 2.3841750413433407, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.47438430786133, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8634822368621826, "num_tokens": 715193736.0, "step": 18742 }, { "epoch": 2.384302251621931, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.82541275024414, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8646368980407715, "num_tokens": 715234539.0, "step": 18743 }, { "epoch": 2.3844294619005217, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.27391815185547, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8753550052642822, "num_tokens": 715271567.0, "step": 18744 }, { "epoch": 2.3845566721791123, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.96987533569336, "learning_rate": 1e-06, "loss": 0.555, "mean_token_accuracy": 0.8761186003684998, "num_tokens": 715314850.0, "step": 18745 }, { "epoch": 2.3846838824577024, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.202999114990234, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8687357902526855, "num_tokens": 715353353.0, "step": 18746 }, { "epoch": 2.3848110927362933, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.95900344848633, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8732686042785645, "num_tokens": 715387843.0, "step": 18747 }, { "epoch": 2.3849383030148834, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.96784591674805, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8497782945632935, "num_tokens": 715428037.0, "step": 18748 }, { "epoch": 2.385065513293474, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.895484924316406, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8768370747566223, "num_tokens": 715464403.0, "step": 18749 }, { "epoch": 2.3851927235720645, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.6338996887207, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8738162517547607, "num_tokens": 715501820.0, "step": 18750 }, { "epoch": 2.385319933850655, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.16277313232422, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8747151494026184, "num_tokens": 715543287.0, "step": 18751 }, { "epoch": 2.3854471441292455, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.07711410522461, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8724199533462524, "num_tokens": 715578189.0, "step": 18752 }, { "epoch": 2.385574354407836, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.984458923339844, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8529686331748962, "num_tokens": 715616447.0, "step": 18753 }, { "epoch": 2.3857015646864266, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.498451232910156, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8679471611976624, "num_tokens": 715651692.0, "step": 18754 }, { "epoch": 2.385828774965017, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.38618087768555, "learning_rate": 1e-06, "loss": 0.6252, "mean_token_accuracy": 0.8548550605773926, "num_tokens": 715687733.0, "step": 18755 }, { "epoch": 2.3859559852436076, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.83702087402344, "learning_rate": 1e-06, "loss": 0.6646, "mean_token_accuracy": 0.8401782512664795, "num_tokens": 715727481.0, "step": 18756 }, { "epoch": 2.386083195522198, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.13072967529297, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8452989459037781, "num_tokens": 715767549.0, "step": 18757 }, { "epoch": 2.3862104058007887, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.31770706176758, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8543490171432495, "num_tokens": 715807251.0, "step": 18758 }, { "epoch": 2.386337616079379, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.25434112548828, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8458646535873413, "num_tokens": 715846893.0, "step": 18759 }, { "epoch": 2.3864648263579697, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.54296112060547, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.8476702570915222, "num_tokens": 715883611.0, "step": 18760 }, { "epoch": 2.3865920366365603, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.1773681640625, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8640676736831665, "num_tokens": 715915598.0, "step": 18761 }, { "epoch": 2.386719246915151, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.098819732666016, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8702353239059448, "num_tokens": 715956647.0, "step": 18762 }, { "epoch": 2.3868464571937413, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.32511901855469, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8697947859764099, "num_tokens": 715996627.0, "step": 18763 }, { "epoch": 2.386973667472332, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.64793395996094, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8840126991271973, "num_tokens": 716027823.0, "step": 18764 }, { "epoch": 2.3871008777509224, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.670265197753906, "learning_rate": 1e-06, "loss": 0.6522, "mean_token_accuracy": 0.8477479815483093, "num_tokens": 716067430.0, "step": 18765 }, { "epoch": 2.387228088029513, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.83452224731445, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8551720380783081, "num_tokens": 716109436.0, "step": 18766 }, { "epoch": 2.3873552983081034, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.11052322387695, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8587133884429932, "num_tokens": 716148157.0, "step": 18767 }, { "epoch": 2.387482508586694, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.82730484008789, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8656638860702515, "num_tokens": 716184662.0, "step": 18768 }, { "epoch": 2.3876097188652845, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.175987243652344, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8557482361793518, "num_tokens": 716227382.0, "step": 18769 }, { "epoch": 2.387736929143875, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.407920837402344, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8658649921417236, "num_tokens": 716262236.0, "step": 18770 }, { "epoch": 2.387864139422465, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.26335144042969, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.8723964095115662, "num_tokens": 716295541.0, "step": 18771 }, { "epoch": 2.387991349701056, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.27616882324219, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8662379384040833, "num_tokens": 716331796.0, "step": 18772 }, { "epoch": 2.388118559979646, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.34815216064453, "learning_rate": 1e-06, "loss": 0.6391, "mean_token_accuracy": 0.8541200757026672, "num_tokens": 716369539.0, "step": 18773 }, { "epoch": 2.3882457702582367, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.95940017700195, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8804082870483398, "num_tokens": 716409806.0, "step": 18774 }, { "epoch": 2.3883729805368272, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.332740783691406, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.8544341921806335, "num_tokens": 716448191.0, "step": 18775 }, { "epoch": 2.3885001908154178, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.788326263427734, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8652113676071167, "num_tokens": 716486522.0, "step": 18776 }, { "epoch": 2.3886274010940083, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.31740951538086, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.862714409828186, "num_tokens": 716519383.0, "step": 18777 }, { "epoch": 2.388754611372599, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.68250274658203, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8650607466697693, "num_tokens": 716556136.0, "step": 18778 }, { "epoch": 2.3888818216511893, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.34296417236328, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8558083772659302, "num_tokens": 716592346.0, "step": 18779 }, { "epoch": 2.38900903192978, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.69231033325195, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8573654294013977, "num_tokens": 716633996.0, "step": 18780 }, { "epoch": 2.3891362422083704, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.57655715942383, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8575417399406433, "num_tokens": 716669638.0, "step": 18781 }, { "epoch": 2.389263452486961, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.51237106323242, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8530469536781311, "num_tokens": 716711194.0, "step": 18782 }, { "epoch": 2.3893906627655515, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.205787658691406, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8686573505401611, "num_tokens": 716753402.0, "step": 18783 }, { "epoch": 2.389517873044142, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.43791580200195, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8664265871047974, "num_tokens": 716794638.0, "step": 18784 }, { "epoch": 2.3896450833227325, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.186153411865234, "learning_rate": 1e-06, "loss": 0.6849, "mean_token_accuracy": 0.8327071070671082, "num_tokens": 716834003.0, "step": 18785 }, { "epoch": 2.389772293601323, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.685550689697266, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8683314323425293, "num_tokens": 716867349.0, "step": 18786 }, { "epoch": 2.3898995038799136, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.193382263183594, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8627686500549316, "num_tokens": 716904271.0, "step": 18787 }, { "epoch": 2.390026714158504, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.39506530761719, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8634028434753418, "num_tokens": 716936165.0, "step": 18788 }, { "epoch": 2.3901539244370946, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.509090423583984, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8606330156326294, "num_tokens": 716974011.0, "step": 18789 }, { "epoch": 2.390281134715685, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.70016098022461, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8624714612960815, "num_tokens": 717012438.0, "step": 18790 }, { "epoch": 2.3904083449942757, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.395355224609375, "learning_rate": 1e-06, "loss": 0.6455, "mean_token_accuracy": 0.853721559047699, "num_tokens": 717051755.0, "step": 18791 }, { "epoch": 2.390535555272866, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.722862243652344, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8637573719024658, "num_tokens": 717090740.0, "step": 18792 }, { "epoch": 2.3906627655514567, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.25776290893555, "learning_rate": 1e-06, "loss": 0.6719, "mean_token_accuracy": 0.8365751504898071, "num_tokens": 717132033.0, "step": 18793 }, { "epoch": 2.390789975830047, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.5738639831543, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.869443416595459, "num_tokens": 717168255.0, "step": 18794 }, { "epoch": 2.390917186108638, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.10040283203125, "learning_rate": 1e-06, "loss": 0.6145, "mean_token_accuracy": 0.860299289226532, "num_tokens": 717206863.0, "step": 18795 }, { "epoch": 2.391044396387228, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.24026870727539, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8422313928604126, "num_tokens": 717245374.0, "step": 18796 }, { "epoch": 2.3911716066658184, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.008060455322266, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8810144662857056, "num_tokens": 717279821.0, "step": 18797 }, { "epoch": 2.391298816944409, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.78911590576172, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8541549444198608, "num_tokens": 717318896.0, "step": 18798 }, { "epoch": 2.3914260272229995, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.96104049682617, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8530020713806152, "num_tokens": 717361493.0, "step": 18799 }, { "epoch": 2.39155323750159, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.111392974853516, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.8531033992767334, "num_tokens": 717402801.0, "step": 18800 }, { "epoch": 2.3916804477801805, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.37896728515625, "learning_rate": 1e-06, "loss": 0.5433, "mean_token_accuracy": 0.8818368911743164, "num_tokens": 717441275.0, "step": 18801 }, { "epoch": 2.391807658058771, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.534759521484375, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.849441409111023, "num_tokens": 717477423.0, "step": 18802 }, { "epoch": 2.3919348683373616, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.09553909301758, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8558995723724365, "num_tokens": 717512978.0, "step": 18803 }, { "epoch": 2.392062078615952, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.46918869018555, "learning_rate": 1e-06, "loss": 0.5366, "mean_token_accuracy": 0.8818930387496948, "num_tokens": 717549710.0, "step": 18804 }, { "epoch": 2.3921892888945426, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.55352783203125, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8583250641822815, "num_tokens": 717586588.0, "step": 18805 }, { "epoch": 2.392316499173133, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.534671783447266, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8755890130996704, "num_tokens": 717621507.0, "step": 18806 }, { "epoch": 2.3924437094517237, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.5941276550293, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.861236572265625, "num_tokens": 717655915.0, "step": 18807 }, { "epoch": 2.392570919730314, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.4456787109375, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8587840795516968, "num_tokens": 717696357.0, "step": 18808 }, { "epoch": 2.3926981300089047, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.54595184326172, "learning_rate": 1e-06, "loss": 0.566, "mean_token_accuracy": 0.8757328391075134, "num_tokens": 717735397.0, "step": 18809 }, { "epoch": 2.3928253402874953, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.06101608276367, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8670591115951538, "num_tokens": 717770081.0, "step": 18810 }, { "epoch": 2.392952550566086, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.19188690185547, "learning_rate": 1e-06, "loss": 0.5429, "mean_token_accuracy": 0.8788684606552124, "num_tokens": 717805262.0, "step": 18811 }, { "epoch": 2.3930797608446763, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.97048568725586, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8619915246963501, "num_tokens": 717841877.0, "step": 18812 }, { "epoch": 2.393206971123267, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.08542251586914, "learning_rate": 1e-06, "loss": 0.6437, "mean_token_accuracy": 0.846703052520752, "num_tokens": 717881435.0, "step": 18813 }, { "epoch": 2.3933341814018574, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.441226959228516, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8819718360900879, "num_tokens": 717922240.0, "step": 18814 }, { "epoch": 2.393461391680448, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.95343017578125, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8602824211120605, "num_tokens": 717958438.0, "step": 18815 }, { "epoch": 2.3935886019590384, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.57597732543945, "learning_rate": 1e-06, "loss": 0.6297, "mean_token_accuracy": 0.8548780679702759, "num_tokens": 717994451.0, "step": 18816 }, { "epoch": 2.393715812237629, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.95610809326172, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8649106025695801, "num_tokens": 718034318.0, "step": 18817 }, { "epoch": 2.3938430225162195, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.079193115234375, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8627705574035645, "num_tokens": 718074994.0, "step": 18818 }, { "epoch": 2.3939702327948096, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.372650146484375, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8595229387283325, "num_tokens": 718118849.0, "step": 18819 }, { "epoch": 2.3940974430734006, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.116783142089844, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8682968020439148, "num_tokens": 718152874.0, "step": 18820 }, { "epoch": 2.3942246533519906, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.03031539916992, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8548462390899658, "num_tokens": 718194206.0, "step": 18821 }, { "epoch": 2.394351863630581, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.3960075378418, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8664754629135132, "num_tokens": 718227408.0, "step": 18822 }, { "epoch": 2.3944790739091717, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.49160385131836, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.857103168964386, "num_tokens": 718266962.0, "step": 18823 }, { "epoch": 2.3946062841877622, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.26496505737305, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.868272602558136, "num_tokens": 718307873.0, "step": 18824 }, { "epoch": 2.3947334944663528, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.887779235839844, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8690018057823181, "num_tokens": 718350296.0, "step": 18825 }, { "epoch": 2.3948607047449433, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.93581008911133, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8651002645492554, "num_tokens": 718390734.0, "step": 18826 }, { "epoch": 2.394987915023534, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.40079879760742, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8788666129112244, "num_tokens": 718435630.0, "step": 18827 }, { "epoch": 2.3951151253021243, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.66489791870117, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8695870637893677, "num_tokens": 718474791.0, "step": 18828 }, { "epoch": 2.395242335580715, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.928062438964844, "learning_rate": 1e-06, "loss": 0.6594, "mean_token_accuracy": 0.8443832397460938, "num_tokens": 718514957.0, "step": 18829 }, { "epoch": 2.3953695458593054, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.61612319946289, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8743208646774292, "num_tokens": 718551761.0, "step": 18830 }, { "epoch": 2.395496756137896, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.468994140625, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8621766567230225, "num_tokens": 718586445.0, "step": 18831 }, { "epoch": 2.3956239664164865, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.97093200683594, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8587824106216431, "num_tokens": 718631105.0, "step": 18832 }, { "epoch": 2.395751176695077, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.66187286376953, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8598170280456543, "num_tokens": 718672278.0, "step": 18833 }, { "epoch": 2.3958783869736675, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.036277770996094, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.868019700050354, "num_tokens": 718713191.0, "step": 18834 }, { "epoch": 2.396005597252258, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.61678695678711, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8559954166412354, "num_tokens": 718745012.0, "step": 18835 }, { "epoch": 2.3961328075308486, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.023780822753906, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8638665080070496, "num_tokens": 718780149.0, "step": 18836 }, { "epoch": 2.396260017809439, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.44011306762695, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8581578731536865, "num_tokens": 718812831.0, "step": 18837 }, { "epoch": 2.3963872280880296, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.40059280395508, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.872003436088562, "num_tokens": 718847650.0, "step": 18838 }, { "epoch": 2.39651443836662, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.4781608581543, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8636264204978943, "num_tokens": 718883417.0, "step": 18839 }, { "epoch": 2.3966416486452107, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.341556549072266, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8888139724731445, "num_tokens": 718915501.0, "step": 18840 }, { "epoch": 2.396768858923801, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.68141555786133, "learning_rate": 1e-06, "loss": 0.5593, "mean_token_accuracy": 0.878258466720581, "num_tokens": 718954485.0, "step": 18841 }, { "epoch": 2.3968960692023917, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.890872955322266, "learning_rate": 1e-06, "loss": 0.6428, "mean_token_accuracy": 0.8483377695083618, "num_tokens": 718995815.0, "step": 18842 }, { "epoch": 2.3970232794809823, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.81243133544922, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.878693163394928, "num_tokens": 719032820.0, "step": 18843 }, { "epoch": 2.3971504897595723, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.980133056640625, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8613353967666626, "num_tokens": 719066966.0, "step": 18844 }, { "epoch": 2.3972777000381633, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 49.01751708984375, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8615614771842957, "num_tokens": 719108768.0, "step": 18845 }, { "epoch": 2.3974049103167534, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.541770935058594, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8607321977615356, "num_tokens": 719151451.0, "step": 18846 }, { "epoch": 2.397532120595344, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.34409713745117, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8552807569503784, "num_tokens": 719192103.0, "step": 18847 }, { "epoch": 2.3976593308739345, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.39424133300781, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8595408201217651, "num_tokens": 719226514.0, "step": 18848 }, { "epoch": 2.397786541152525, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.50789260864258, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8883394002914429, "num_tokens": 719262944.0, "step": 18849 }, { "epoch": 2.3979137514311155, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.0043830871582, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8679993748664856, "num_tokens": 719306707.0, "step": 18850 }, { "epoch": 2.398040961709706, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.3011589050293, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8685081005096436, "num_tokens": 719347523.0, "step": 18851 }, { "epoch": 2.3981681719882966, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.02018737792969, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8827108144760132, "num_tokens": 719379474.0, "step": 18852 }, { "epoch": 2.398295382266887, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.98060607910156, "learning_rate": 1e-06, "loss": 0.6535, "mean_token_accuracy": 0.8459818363189697, "num_tokens": 719421624.0, "step": 18853 }, { "epoch": 2.3984225925454776, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.319759368896484, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8594595789909363, "num_tokens": 719455570.0, "step": 18854 }, { "epoch": 2.398549802824068, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.80938720703125, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8813170194625854, "num_tokens": 719500300.0, "step": 18855 }, { "epoch": 2.3986770131026587, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.82810592651367, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8481016159057617, "num_tokens": 719538119.0, "step": 18856 }, { "epoch": 2.398804223381249, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.640682220458984, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8520399332046509, "num_tokens": 719574012.0, "step": 18857 }, { "epoch": 2.3989314336598397, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.48390579223633, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.875502347946167, "num_tokens": 719609548.0, "step": 18858 }, { "epoch": 2.3990586439384303, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.36039733886719, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8578907251358032, "num_tokens": 719647280.0, "step": 18859 }, { "epoch": 2.399185854217021, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.49076461791992, "learning_rate": 1e-06, "loss": 0.5426, "mean_token_accuracy": 0.8789831399917603, "num_tokens": 719686987.0, "step": 18860 }, { "epoch": 2.3993130644956113, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.15573501586914, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8813741207122803, "num_tokens": 719727974.0, "step": 18861 }, { "epoch": 2.399440274774202, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.17265319824219, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8625205755233765, "num_tokens": 719764308.0, "step": 18862 }, { "epoch": 2.3995674850527924, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.750972747802734, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8522691130638123, "num_tokens": 719801872.0, "step": 18863 }, { "epoch": 2.399694695331383, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.511356353759766, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8654528260231018, "num_tokens": 719842504.0, "step": 18864 }, { "epoch": 2.3998219056099734, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.06805419921875, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.857575535774231, "num_tokens": 719878885.0, "step": 18865 }, { "epoch": 2.399949115888564, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.041812896728516, "learning_rate": 1e-06, "loss": 0.6494, "mean_token_accuracy": 0.8470726609230042, "num_tokens": 719922891.0, "step": 18866 }, { "epoch": 2.4000763261671545, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.217464447021484, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8617728352546692, "num_tokens": 719960113.0, "step": 18867 }, { "epoch": 2.400203536445745, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.97156524658203, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8634955883026123, "num_tokens": 720006079.0, "step": 18868 }, { "epoch": 2.400330746724335, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.3309326171875, "learning_rate": 1e-06, "loss": 0.6279, "mean_token_accuracy": 0.8497112989425659, "num_tokens": 720042219.0, "step": 18869 }, { "epoch": 2.400457957002926, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.938133239746094, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8643364906311035, "num_tokens": 720082574.0, "step": 18870 }, { "epoch": 2.400585167281516, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.319557189941406, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8598713278770447, "num_tokens": 720115583.0, "step": 18871 }, { "epoch": 2.4007123775601067, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.85712814331055, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8529495596885681, "num_tokens": 720151350.0, "step": 18872 }, { "epoch": 2.400839587838697, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.480491638183594, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8544632196426392, "num_tokens": 720187072.0, "step": 18873 }, { "epoch": 2.4009667981172877, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.7490119934082, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8662624359130859, "num_tokens": 720222249.0, "step": 18874 }, { "epoch": 2.4010940083958783, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.45235824584961, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8615512847900391, "num_tokens": 720264682.0, "step": 18875 }, { "epoch": 2.401221218674469, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.72120666503906, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8601131439208984, "num_tokens": 720309935.0, "step": 18876 }, { "epoch": 2.4013484289530593, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.98270034790039, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8553221225738525, "num_tokens": 720343992.0, "step": 18877 }, { "epoch": 2.40147563923165, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.98358917236328, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8615716099739075, "num_tokens": 720379707.0, "step": 18878 }, { "epoch": 2.4016028495102404, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.12828063964844, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8591954112052917, "num_tokens": 720419796.0, "step": 18879 }, { "epoch": 2.401730059788831, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.3765983581543, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8657558560371399, "num_tokens": 720460919.0, "step": 18880 }, { "epoch": 2.4018572700674214, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.093875885009766, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8580474853515625, "num_tokens": 720498403.0, "step": 18881 }, { "epoch": 2.401984480346012, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.964420318603516, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8620874881744385, "num_tokens": 720534931.0, "step": 18882 }, { "epoch": 2.4021116906246025, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.43134307861328, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8679589033126831, "num_tokens": 720574062.0, "step": 18883 }, { "epoch": 2.402238900903193, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.8975715637207, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.8548756837844849, "num_tokens": 720611552.0, "step": 18884 }, { "epoch": 2.4023661111817836, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.639617919921875, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8621280193328857, "num_tokens": 720647886.0, "step": 18885 }, { "epoch": 2.402493321460374, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.054779052734375, "learning_rate": 1e-06, "loss": 0.569, "mean_token_accuracy": 0.868971049785614, "num_tokens": 720688034.0, "step": 18886 }, { "epoch": 2.4026205317389646, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.42856216430664, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8692770004272461, "num_tokens": 720727805.0, "step": 18887 }, { "epoch": 2.402747742017555, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.27836608886719, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8658508658409119, "num_tokens": 720759717.0, "step": 18888 }, { "epoch": 2.4028749522961457, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.48701095581055, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8560069799423218, "num_tokens": 720790435.0, "step": 18889 }, { "epoch": 2.403002162574736, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 50.00044250488281, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8589507341384888, "num_tokens": 720831715.0, "step": 18890 }, { "epoch": 2.4031293728533267, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.99776840209961, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8622487187385559, "num_tokens": 720869598.0, "step": 18891 }, { "epoch": 2.403256583131917, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 50.124534606933594, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8569674491882324, "num_tokens": 720911122.0, "step": 18892 }, { "epoch": 2.403383793410508, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.74641418457031, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8680820465087891, "num_tokens": 720952265.0, "step": 18893 }, { "epoch": 2.403511003689098, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.97499465942383, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8667964935302734, "num_tokens": 720991191.0, "step": 18894 }, { "epoch": 2.4036382139676884, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.92550277709961, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8732513189315796, "num_tokens": 721032524.0, "step": 18895 }, { "epoch": 2.403765424246279, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.563602447509766, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8598970174789429, "num_tokens": 721072699.0, "step": 18896 }, { "epoch": 2.4038926345248695, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.249942779541016, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8756149411201477, "num_tokens": 721109467.0, "step": 18897 }, { "epoch": 2.40401984480346, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.37593078613281, "learning_rate": 1e-06, "loss": 0.6886, "mean_token_accuracy": 0.8365483283996582, "num_tokens": 721142108.0, "step": 18898 }, { "epoch": 2.4041470550820505, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.417884826660156, "learning_rate": 1e-06, "loss": 0.6248, "mean_token_accuracy": 0.8558206558227539, "num_tokens": 721179875.0, "step": 18899 }, { "epoch": 2.404274265360641, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.4472541809082, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8593134880065918, "num_tokens": 721217141.0, "step": 18900 }, { "epoch": 2.4044014756392316, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.181785583496094, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8567503690719604, "num_tokens": 721256357.0, "step": 18901 }, { "epoch": 2.404528685917822, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.47516632080078, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8611158132553101, "num_tokens": 721292610.0, "step": 18902 }, { "epoch": 2.4046558961964126, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.50407791137695, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8677198886871338, "num_tokens": 721329048.0, "step": 18903 }, { "epoch": 2.404783106475003, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.745147705078125, "learning_rate": 1e-06, "loss": 0.5483, "mean_token_accuracy": 0.8770710229873657, "num_tokens": 721364249.0, "step": 18904 }, { "epoch": 2.4049103167535937, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.88323974609375, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8700259923934937, "num_tokens": 721391154.0, "step": 18905 }, { "epoch": 2.405037527032184, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.71628952026367, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8724446892738342, "num_tokens": 721428443.0, "step": 18906 }, { "epoch": 2.4051647373107747, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.978057861328125, "learning_rate": 1e-06, "loss": 0.6374, "mean_token_accuracy": 0.8522367477416992, "num_tokens": 721474545.0, "step": 18907 }, { "epoch": 2.4052919475893653, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.07451248168945, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8626018762588501, "num_tokens": 721513153.0, "step": 18908 }, { "epoch": 2.405419157867956, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.48688888549805, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8600289821624756, "num_tokens": 721556287.0, "step": 18909 }, { "epoch": 2.4055463681465463, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.38277816772461, "learning_rate": 1e-06, "loss": 0.5679, "mean_token_accuracy": 0.8695279359817505, "num_tokens": 721590219.0, "step": 18910 }, { "epoch": 2.405673578425137, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.95730209350586, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8621636629104614, "num_tokens": 721627308.0, "step": 18911 }, { "epoch": 2.4058007887037274, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.39480972290039, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8581779599189758, "num_tokens": 721664218.0, "step": 18912 }, { "epoch": 2.405927998982318, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.35752487182617, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8713874220848083, "num_tokens": 721703511.0, "step": 18913 }, { "epoch": 2.4060552092609084, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.994205474853516, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8756063580513, "num_tokens": 721741512.0, "step": 18914 }, { "epoch": 2.406182419539499, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.98855209350586, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8674590587615967, "num_tokens": 721778443.0, "step": 18915 }, { "epoch": 2.4063096298180895, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.631656646728516, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8590443134307861, "num_tokens": 721817226.0, "step": 18916 }, { "epoch": 2.4064368400966796, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.80558776855469, "learning_rate": 1e-06, "loss": 0.6513, "mean_token_accuracy": 0.8474442958831787, "num_tokens": 721859419.0, "step": 18917 }, { "epoch": 2.4065640503752705, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.48467254638672, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8758606910705566, "num_tokens": 721895183.0, "step": 18918 }, { "epoch": 2.4066912606538606, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.91889953613281, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8481903076171875, "num_tokens": 721933566.0, "step": 18919 }, { "epoch": 2.406818470932451, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.96522521972656, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8650695085525513, "num_tokens": 721968827.0, "step": 18920 }, { "epoch": 2.4069456812110417, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.49786376953125, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8666542768478394, "num_tokens": 722004989.0, "step": 18921 }, { "epoch": 2.407072891489632, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.713348388671875, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8676034212112427, "num_tokens": 722043681.0, "step": 18922 }, { "epoch": 2.4072001017682227, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.66568374633789, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8629941940307617, "num_tokens": 722086541.0, "step": 18923 }, { "epoch": 2.4073273120468133, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.81748962402344, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8643912672996521, "num_tokens": 722125519.0, "step": 18924 }, { "epoch": 2.407454522325404, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.62093734741211, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.857847273349762, "num_tokens": 722157665.0, "step": 18925 }, { "epoch": 2.4075817326039943, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.686317443847656, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8756344318389893, "num_tokens": 722191819.0, "step": 18926 }, { "epoch": 2.407708942882585, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.78342056274414, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8529039025306702, "num_tokens": 722229755.0, "step": 18927 }, { "epoch": 2.4078361531611754, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.39926528930664, "learning_rate": 1e-06, "loss": 0.6329, "mean_token_accuracy": 0.8532017469406128, "num_tokens": 722267440.0, "step": 18928 }, { "epoch": 2.407963363439766, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.511600494384766, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.859808087348938, "num_tokens": 722310102.0, "step": 18929 }, { "epoch": 2.4080905737183564, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.922401428222656, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8562072515487671, "num_tokens": 722346713.0, "step": 18930 }, { "epoch": 2.408217783996947, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.07285690307617, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.857077419757843, "num_tokens": 722387803.0, "step": 18931 }, { "epoch": 2.4083449942755375, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.354434967041016, "learning_rate": 1e-06, "loss": 0.641, "mean_token_accuracy": 0.854515552520752, "num_tokens": 722428943.0, "step": 18932 }, { "epoch": 2.408472204554128, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.225807189941406, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8562202453613281, "num_tokens": 722467860.0, "step": 18933 }, { "epoch": 2.4085994148327186, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.807491302490234, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8596681952476501, "num_tokens": 722503067.0, "step": 18934 }, { "epoch": 2.408726625111309, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.66588592529297, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8650255799293518, "num_tokens": 722537842.0, "step": 18935 }, { "epoch": 2.4088538353898996, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.18156433105469, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8681128025054932, "num_tokens": 722574036.0, "step": 18936 }, { "epoch": 2.40898104566849, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.56560134887695, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8630691170692444, "num_tokens": 722606087.0, "step": 18937 }, { "epoch": 2.4091082559470807, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.0738525390625, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8609821796417236, "num_tokens": 722648745.0, "step": 18938 }, { "epoch": 2.409235466225671, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.64323043823242, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8539330363273621, "num_tokens": 722688589.0, "step": 18939 }, { "epoch": 2.4093626765042617, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.58268356323242, "learning_rate": 1e-06, "loss": 0.5509, "mean_token_accuracy": 0.8812612295150757, "num_tokens": 722723941.0, "step": 18940 }, { "epoch": 2.4094898867828523, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.7073974609375, "learning_rate": 1e-06, "loss": 0.6499, "mean_token_accuracy": 0.8468462228775024, "num_tokens": 722768242.0, "step": 18941 }, { "epoch": 2.4096170970614423, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.433650970458984, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8641522526741028, "num_tokens": 722800970.0, "step": 18942 }, { "epoch": 2.4097443073400333, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.345394134521484, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8528074026107788, "num_tokens": 722839168.0, "step": 18943 }, { "epoch": 2.4098715176186234, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.55022048950195, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8732337951660156, "num_tokens": 722883451.0, "step": 18944 }, { "epoch": 2.409998727897214, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.56412887573242, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.86370849609375, "num_tokens": 722928581.0, "step": 18945 }, { "epoch": 2.4101259381758044, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.16102600097656, "learning_rate": 1e-06, "loss": 0.5566, "mean_token_accuracy": 0.873704731464386, "num_tokens": 722967141.0, "step": 18946 }, { "epoch": 2.410253148454395, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.60140609741211, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8584147691726685, "num_tokens": 723006263.0, "step": 18947 }, { "epoch": 2.4103803587329855, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.554195404052734, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8651418685913086, "num_tokens": 723050843.0, "step": 18948 }, { "epoch": 2.410507569011576, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.77870559692383, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8620507121086121, "num_tokens": 723092124.0, "step": 18949 }, { "epoch": 2.4106347792901666, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.2762336730957, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8737035393714905, "num_tokens": 723122478.0, "step": 18950 }, { "epoch": 2.410761989568757, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.69855499267578, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.856844425201416, "num_tokens": 723163054.0, "step": 18951 }, { "epoch": 2.4108891998473476, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.26890182495117, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8635101318359375, "num_tokens": 723201518.0, "step": 18952 }, { "epoch": 2.411016410125938, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.9158821105957, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8641766309738159, "num_tokens": 723235685.0, "step": 18953 }, { "epoch": 2.4111436204045287, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.29970169067383, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8617410659790039, "num_tokens": 723266510.0, "step": 18954 }, { "epoch": 2.411270830683119, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 50.0335578918457, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8655967116355896, "num_tokens": 723302800.0, "step": 18955 }, { "epoch": 2.4113980409617097, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.32006072998047, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.864741861820221, "num_tokens": 723340669.0, "step": 18956 }, { "epoch": 2.4115252512403003, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.37126541137695, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8652231097221375, "num_tokens": 723377556.0, "step": 18957 }, { "epoch": 2.411652461518891, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.894927978515625, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8678068518638611, "num_tokens": 723410942.0, "step": 18958 }, { "epoch": 2.4117796717974813, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.26860809326172, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8735017776489258, "num_tokens": 723451131.0, "step": 18959 }, { "epoch": 2.411906882076072, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.72962951660156, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8650707602500916, "num_tokens": 723485876.0, "step": 18960 }, { "epoch": 2.4120340923546624, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.91856002807617, "learning_rate": 1e-06, "loss": 0.5578, "mean_token_accuracy": 0.8700757026672363, "num_tokens": 723521252.0, "step": 18961 }, { "epoch": 2.412161302633253, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.81938934326172, "learning_rate": 1e-06, "loss": 0.6456, "mean_token_accuracy": 0.8461472988128662, "num_tokens": 723561094.0, "step": 18962 }, { "epoch": 2.4122885129118434, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.82808303833008, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8618486523628235, "num_tokens": 723597281.0, "step": 18963 }, { "epoch": 2.412415723190434, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.46421813964844, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.886238694190979, "num_tokens": 723637690.0, "step": 18964 }, { "epoch": 2.4125429334690245, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.63201904296875, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8627082109451294, "num_tokens": 723676250.0, "step": 18965 }, { "epoch": 2.412670143747615, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.76411819458008, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8703952431678772, "num_tokens": 723713157.0, "step": 18966 }, { "epoch": 2.412797354026205, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 50.05888748168945, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8680680394172668, "num_tokens": 723745877.0, "step": 18967 }, { "epoch": 2.412924564304796, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.26875305175781, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8642857074737549, "num_tokens": 723784285.0, "step": 18968 }, { "epoch": 2.413051774583386, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 50.07683181762695, "learning_rate": 1e-06, "loss": 0.6301, "mean_token_accuracy": 0.8518673181533813, "num_tokens": 723817085.0, "step": 18969 }, { "epoch": 2.4131789848619767, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.72401809692383, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.857628583908081, "num_tokens": 723858466.0, "step": 18970 }, { "epoch": 2.413306195140567, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.574520111083984, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.862591028213501, "num_tokens": 723896627.0, "step": 18971 }, { "epoch": 2.4134334054191577, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.87084197998047, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8593345284461975, "num_tokens": 723930727.0, "step": 18972 }, { "epoch": 2.4135606156977483, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.36191177368164, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8513083457946777, "num_tokens": 723965996.0, "step": 18973 }, { "epoch": 2.413687825976339, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 50.456016540527344, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.858533501625061, "num_tokens": 724002441.0, "step": 18974 }, { "epoch": 2.4138150362549293, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.0485725402832, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8668848872184753, "num_tokens": 724039238.0, "step": 18975 }, { "epoch": 2.41394224653352, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.07285690307617, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8687044978141785, "num_tokens": 724077430.0, "step": 18976 }, { "epoch": 2.4140694568121104, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 49.12606430053711, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8615583181381226, "num_tokens": 724113175.0, "step": 18977 }, { "epoch": 2.414196667090701, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.978309631347656, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8577080368995667, "num_tokens": 724156409.0, "step": 18978 }, { "epoch": 2.4143238773692914, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.905792236328125, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8628743886947632, "num_tokens": 724199879.0, "step": 18979 }, { "epoch": 2.414451087647882, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.51512908935547, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8774621486663818, "num_tokens": 724238065.0, "step": 18980 }, { "epoch": 2.4145782979264725, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.58600997924805, "learning_rate": 1e-06, "loss": 0.623, "mean_token_accuracy": 0.8545642495155334, "num_tokens": 724272336.0, "step": 18981 }, { "epoch": 2.414705508205063, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.931392669677734, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.85417640209198, "num_tokens": 724305200.0, "step": 18982 }, { "epoch": 2.4148327184836536, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.384212493896484, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8657877445220947, "num_tokens": 724341295.0, "step": 18983 }, { "epoch": 2.414959928762244, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.74444580078125, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.865278959274292, "num_tokens": 724384336.0, "step": 18984 }, { "epoch": 2.4150871390408346, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.10832595825195, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8586512804031372, "num_tokens": 724417279.0, "step": 18985 }, { "epoch": 2.415214349319425, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.54996109008789, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8629735708236694, "num_tokens": 724456509.0, "step": 18986 }, { "epoch": 2.4153415595980157, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.46514892578125, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8740805387496948, "num_tokens": 724498812.0, "step": 18987 }, { "epoch": 2.415468769876606, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.52753448486328, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.8502588272094727, "num_tokens": 724538174.0, "step": 18988 }, { "epoch": 2.4155959801551967, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.26475143432617, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8673680424690247, "num_tokens": 724577171.0, "step": 18989 }, { "epoch": 2.415723190433787, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.68189239501953, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8841384649276733, "num_tokens": 724610626.0, "step": 18990 }, { "epoch": 2.4158504007123778, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.68694305419922, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8749517202377319, "num_tokens": 724647613.0, "step": 18991 }, { "epoch": 2.415977610990968, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.424007415771484, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8648406267166138, "num_tokens": 724684898.0, "step": 18992 }, { "epoch": 2.4161048212695584, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.56490707397461, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8594399094581604, "num_tokens": 724725566.0, "step": 18993 }, { "epoch": 2.416232031548149, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.75120544433594, "learning_rate": 1e-06, "loss": 0.626, "mean_token_accuracy": 0.854843020439148, "num_tokens": 724763701.0, "step": 18994 }, { "epoch": 2.4163592418267394, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.09195327758789, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8615111112594604, "num_tokens": 724798108.0, "step": 18995 }, { "epoch": 2.41648645210533, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.384361267089844, "learning_rate": 1e-06, "loss": 0.66, "mean_token_accuracy": 0.8404281139373779, "num_tokens": 724842430.0, "step": 18996 }, { "epoch": 2.4166136623839205, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 48.811222076416016, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8588037490844727, "num_tokens": 724884585.0, "step": 18997 }, { "epoch": 2.416740872662511, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.041725158691406, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8740017414093018, "num_tokens": 724919338.0, "step": 18998 }, { "epoch": 2.4168680829411016, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.317176818847656, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8850106000900269, "num_tokens": 724956007.0, "step": 18999 }, { "epoch": 2.416995293219692, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.76476287841797, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8791545033454895, "num_tokens": 724996994.0, "step": 19000 }, { "epoch": 2.4171225034982826, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.88359832763672, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8560250997543335, "num_tokens": 725035607.0, "step": 19001 }, { "epoch": 2.417249713776873, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.23031997680664, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8665638566017151, "num_tokens": 725075370.0, "step": 19002 }, { "epoch": 2.4173769240554637, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.41080093383789, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8618987202644348, "num_tokens": 725119536.0, "step": 19003 }, { "epoch": 2.417504134334054, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.56742477416992, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8631775379180908, "num_tokens": 725158048.0, "step": 19004 }, { "epoch": 2.4176313446126447, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.545963287353516, "learning_rate": 1e-06, "loss": 0.5667, "mean_token_accuracy": 0.8730151653289795, "num_tokens": 725196343.0, "step": 19005 }, { "epoch": 2.4177585548912353, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.43886184692383, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.866060197353363, "num_tokens": 725234613.0, "step": 19006 }, { "epoch": 2.417885765169826, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 50.10438919067383, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.862284779548645, "num_tokens": 725272129.0, "step": 19007 }, { "epoch": 2.4180129754484163, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.621360778808594, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8537384271621704, "num_tokens": 725305740.0, "step": 19008 }, { "epoch": 2.418140185727007, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.23787307739258, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8586379289627075, "num_tokens": 725340579.0, "step": 19009 }, { "epoch": 2.4182673960055974, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.34629440307617, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8652601838111877, "num_tokens": 725380492.0, "step": 19010 }, { "epoch": 2.418394606284188, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.93528747558594, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8630313873291016, "num_tokens": 725418043.0, "step": 19011 }, { "epoch": 2.4185218165627784, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.88037109375, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8549691438674927, "num_tokens": 725456260.0, "step": 19012 }, { "epoch": 2.418649026841369, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.427494049072266, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8598392009735107, "num_tokens": 725496605.0, "step": 19013 }, { "epoch": 2.4187762371199595, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 50.08245086669922, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.864061713218689, "num_tokens": 725532395.0, "step": 19014 }, { "epoch": 2.4189034473985496, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.65666580200195, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8577461242675781, "num_tokens": 725575335.0, "step": 19015 }, { "epoch": 2.4190306576771405, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.95423126220703, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8707761168479919, "num_tokens": 725612345.0, "step": 19016 }, { "epoch": 2.4191578679557306, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.25075912475586, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8682213425636292, "num_tokens": 725645566.0, "step": 19017 }, { "epoch": 2.419285078234321, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.751136779785156, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8577751517295837, "num_tokens": 725684218.0, "step": 19018 }, { "epoch": 2.4194122885129117, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 48.97934341430664, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8666458129882812, "num_tokens": 725718478.0, "step": 19019 }, { "epoch": 2.419539498791502, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.35179138183594, "learning_rate": 1e-06, "loss": 0.6321, "mean_token_accuracy": 0.8558973670005798, "num_tokens": 725756208.0, "step": 19020 }, { "epoch": 2.4196667090700927, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.115291595458984, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8579270243644714, "num_tokens": 725791884.0, "step": 19021 }, { "epoch": 2.4197939193486833, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.279808044433594, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8679554462432861, "num_tokens": 725829138.0, "step": 19022 }, { "epoch": 2.419921129627274, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.88157653808594, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.864026665687561, "num_tokens": 725868301.0, "step": 19023 }, { "epoch": 2.4200483399058643, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.446197509765625, "learning_rate": 1e-06, "loss": 0.6468, "mean_token_accuracy": 0.8515809178352356, "num_tokens": 725906186.0, "step": 19024 }, { "epoch": 2.420175550184455, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.89069366455078, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8505539298057556, "num_tokens": 725947006.0, "step": 19025 }, { "epoch": 2.4203027604630454, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.44620132446289, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.866051435470581, "num_tokens": 725987363.0, "step": 19026 }, { "epoch": 2.420429970741636, "ewc_loss": 0.1904296875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016689300537109375, "grad_norm": 48.94429016113281, "learning_rate": 1e-06, "loss": 0.6245, "mean_token_accuracy": 0.8537147045135498, "num_tokens": 726026176.0, "step": 19027 }, { "epoch": 2.4205571810202264, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.07100296020508, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8629574179649353, "num_tokens": 726068124.0, "step": 19028 }, { "epoch": 2.420684391298817, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.17451095581055, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8544408082962036, "num_tokens": 726110270.0, "step": 19029 }, { "epoch": 2.4208116015774075, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.4923095703125, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8796080946922302, "num_tokens": 726143908.0, "step": 19030 }, { "epoch": 2.420938811855998, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.39878845214844, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8609567284584045, "num_tokens": 726179082.0, "step": 19031 }, { "epoch": 2.4210660221345885, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.13015365600586, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8543164730072021, "num_tokens": 726216387.0, "step": 19032 }, { "epoch": 2.421193232413179, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.16887283325195, "learning_rate": 1e-06, "loss": 0.5713, "mean_token_accuracy": 0.8729150891304016, "num_tokens": 726258831.0, "step": 19033 }, { "epoch": 2.4213204426917696, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.14041519165039, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8638391494750977, "num_tokens": 726301929.0, "step": 19034 }, { "epoch": 2.42144765297036, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.15419006347656, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8653199076652527, "num_tokens": 726339938.0, "step": 19035 }, { "epoch": 2.4215748632489507, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.10837936401367, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8653041124343872, "num_tokens": 726377197.0, "step": 19036 }, { "epoch": 2.421702073527541, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.34207534790039, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8664491772651672, "num_tokens": 726417591.0, "step": 19037 }, { "epoch": 2.4218292838061317, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.1007080078125, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8537132740020752, "num_tokens": 726456094.0, "step": 19038 }, { "epoch": 2.4219564940847222, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.72372817993164, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8713825941085815, "num_tokens": 726491158.0, "step": 19039 }, { "epoch": 2.4220837043633123, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.18975830078125, "learning_rate": 1e-06, "loss": 0.6356, "mean_token_accuracy": 0.849367082118988, "num_tokens": 726531334.0, "step": 19040 }, { "epoch": 2.4222109146419033, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.66189956665039, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8551982641220093, "num_tokens": 726571999.0, "step": 19041 }, { "epoch": 2.4223381249204934, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.708065032958984, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8545008897781372, "num_tokens": 726604945.0, "step": 19042 }, { "epoch": 2.422465335199084, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.86056900024414, "learning_rate": 1e-06, "loss": 0.6343, "mean_token_accuracy": 0.854133129119873, "num_tokens": 726645023.0, "step": 19043 }, { "epoch": 2.4225925454776744, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.79258346557617, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8624106645584106, "num_tokens": 726684297.0, "step": 19044 }, { "epoch": 2.422719755756265, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.08640670776367, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8524904251098633, "num_tokens": 726726950.0, "step": 19045 }, { "epoch": 2.4228469660348555, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.298919677734375, "learning_rate": 1e-06, "loss": 0.6656, "mean_token_accuracy": 0.8436278700828552, "num_tokens": 726761944.0, "step": 19046 }, { "epoch": 2.422974176313446, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.2430305480957, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8725394010543823, "num_tokens": 726799041.0, "step": 19047 }, { "epoch": 2.4231013865920366, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.630767822265625, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8565516471862793, "num_tokens": 726838160.0, "step": 19048 }, { "epoch": 2.423228596870627, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.71818161010742, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8727579116821289, "num_tokens": 726873150.0, "step": 19049 }, { "epoch": 2.4233558071492176, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.76791763305664, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.867384672164917, "num_tokens": 726905253.0, "step": 19050 }, { "epoch": 2.423483017427808, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.41844177246094, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8747066855430603, "num_tokens": 726942842.0, "step": 19051 }, { "epoch": 2.4236102277063987, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.491390228271484, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8678853511810303, "num_tokens": 726987974.0, "step": 19052 }, { "epoch": 2.423737437984989, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.89302062988281, "learning_rate": 1e-06, "loss": 0.543, "mean_token_accuracy": 0.8818042278289795, "num_tokens": 727021614.0, "step": 19053 }, { "epoch": 2.4238646482635797, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.573387145996094, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8613185286521912, "num_tokens": 727061773.0, "step": 19054 }, { "epoch": 2.4239918585421703, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.81753921508789, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.8565120100975037, "num_tokens": 727095877.0, "step": 19055 }, { "epoch": 2.424119068820761, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.383323669433594, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8666985034942627, "num_tokens": 727128676.0, "step": 19056 }, { "epoch": 2.4242462790993513, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.81642150878906, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.8719080686569214, "num_tokens": 727164528.0, "step": 19057 }, { "epoch": 2.424373489377942, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.27579879760742, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8702682852745056, "num_tokens": 727202403.0, "step": 19058 }, { "epoch": 2.4245006996565324, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.999107360839844, "learning_rate": 1e-06, "loss": 0.5721, "mean_token_accuracy": 0.8667404651641846, "num_tokens": 727242231.0, "step": 19059 }, { "epoch": 2.424627909935123, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.430328369140625, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8465030193328857, "num_tokens": 727276371.0, "step": 19060 }, { "epoch": 2.4247551202137134, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.88912582397461, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8663452863693237, "num_tokens": 727320101.0, "step": 19061 }, { "epoch": 2.424882330492304, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.67017364501953, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8539907932281494, "num_tokens": 727363687.0, "step": 19062 }, { "epoch": 2.4250095407708945, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.41835021972656, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8673102259635925, "num_tokens": 727399772.0, "step": 19063 }, { "epoch": 2.425136751049485, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.33369827270508, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8726863861083984, "num_tokens": 727442944.0, "step": 19064 }, { "epoch": 2.425263961328075, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.458290100097656, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8722584247589111, "num_tokens": 727483283.0, "step": 19065 }, { "epoch": 2.425391171606666, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.28887939453125, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8624277710914612, "num_tokens": 727526316.0, "step": 19066 }, { "epoch": 2.425518381885256, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.47865295410156, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8703538179397583, "num_tokens": 727563218.0, "step": 19067 }, { "epoch": 2.4256455921638467, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 48.893890380859375, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.876169741153717, "num_tokens": 727594877.0, "step": 19068 }, { "epoch": 2.425772802442437, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.34128952026367, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8755340576171875, "num_tokens": 727627561.0, "step": 19069 }, { "epoch": 2.4259000127210277, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 50.24176025390625, "learning_rate": 1e-06, "loss": 0.6184, "mean_token_accuracy": 0.8556895852088928, "num_tokens": 727664384.0, "step": 19070 }, { "epoch": 2.4260272229996183, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.681312561035156, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8605883121490479, "num_tokens": 727700866.0, "step": 19071 }, { "epoch": 2.426154433278209, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.23410415649414, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8821530938148499, "num_tokens": 727741539.0, "step": 19072 }, { "epoch": 2.4262816435567993, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.50773239135742, "learning_rate": 1e-06, "loss": 0.5819, "mean_token_accuracy": 0.8639235496520996, "num_tokens": 727777828.0, "step": 19073 }, { "epoch": 2.42640885383539, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.00334548950195, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.8722039461135864, "num_tokens": 727818979.0, "step": 19074 }, { "epoch": 2.4265360641139804, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.97611999511719, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8622597455978394, "num_tokens": 727856440.0, "step": 19075 }, { "epoch": 2.426663274392571, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.71416473388672, "learning_rate": 1e-06, "loss": 0.5665, "mean_token_accuracy": 0.8733636736869812, "num_tokens": 727895036.0, "step": 19076 }, { "epoch": 2.4267904846711614, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.73344421386719, "learning_rate": 1e-06, "loss": 0.633, "mean_token_accuracy": 0.850788950920105, "num_tokens": 727932292.0, "step": 19077 }, { "epoch": 2.426917694949752, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.06358337402344, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8638827800750732, "num_tokens": 727967005.0, "step": 19078 }, { "epoch": 2.4270449052283425, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.55247116088867, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8543710112571716, "num_tokens": 728004262.0, "step": 19079 }, { "epoch": 2.427172115506933, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.378929138183594, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8717977404594421, "num_tokens": 728041451.0, "step": 19080 }, { "epoch": 2.4272993257855235, "ewc_loss": 0.1943359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017070770263671875, "grad_norm": 49.615718841552734, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8615333437919617, "num_tokens": 728079217.0, "step": 19081 }, { "epoch": 2.427426536064114, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.226070404052734, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8798639178276062, "num_tokens": 728116221.0, "step": 19082 }, { "epoch": 2.4275537463427046, "ewc_loss": 0.1923828125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00016880035400390625, "grad_norm": 49.59817123413086, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8539261221885681, "num_tokens": 728154861.0, "step": 19083 }, { "epoch": 2.427680956621295, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.57778549194336, "learning_rate": 1e-06, "loss": 0.6439, "mean_token_accuracy": 0.8522750735282898, "num_tokens": 728188276.0, "step": 19084 }, { "epoch": 2.4278081668998857, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 48.575927734375, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8751213550567627, "num_tokens": 728229313.0, "step": 19085 }, { "epoch": 2.427935377178476, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 51.55504608154297, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.85386061668396, "num_tokens": 728268706.0, "step": 19086 }, { "epoch": 2.4280625874570667, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.552703857421875, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8766992092132568, "num_tokens": 728297490.0, "step": 19087 }, { "epoch": 2.428189797735657, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.89377212524414, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8570988178253174, "num_tokens": 728336041.0, "step": 19088 }, { "epoch": 2.4283170080142478, "ewc_loss": 0.19140625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001678466796875, "grad_norm": 49.26688003540039, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.870287299156189, "num_tokens": 728372701.0, "step": 19089 }, { "epoch": 2.428444218292838, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.64622497558594, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8671290278434753, "num_tokens": 728408138.0, "step": 19090 }, { "epoch": 2.4285714285714284, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.271873474121094, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8706311583518982, "num_tokens": 728452649.0, "step": 19091 }, { "epoch": 2.428698638850019, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.30827713012695, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.8543189764022827, "num_tokens": 728490551.0, "step": 19092 }, { "epoch": 2.4288258491286094, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.63190460205078, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.8680710792541504, "num_tokens": 728529558.0, "step": 19093 }, { "epoch": 2.4289530594072, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.834842681884766, "learning_rate": 1e-06, "loss": 0.6813, "mean_token_accuracy": 0.838645339012146, "num_tokens": 728573495.0, "step": 19094 }, { "epoch": 2.4290802696857905, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.76847839355469, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8634783029556274, "num_tokens": 728609166.0, "step": 19095 }, { "epoch": 2.429207479964381, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.0699462890625, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8713554739952087, "num_tokens": 728653070.0, "step": 19096 }, { "epoch": 2.4293346902429716, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.077884674072266, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8666036128997803, "num_tokens": 728689142.0, "step": 19097 }, { "epoch": 2.429461900521562, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.14438247680664, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8673439621925354, "num_tokens": 728725615.0, "step": 19098 }, { "epoch": 2.4295891108001526, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.90733337402344, "learning_rate": 1e-06, "loss": 0.6336, "mean_token_accuracy": 0.8522461652755737, "num_tokens": 728762366.0, "step": 19099 }, { "epoch": 2.429716321078743, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.38078689575195, "learning_rate": 1e-06, "loss": 0.6455, "mean_token_accuracy": 0.8525158762931824, "num_tokens": 728801951.0, "step": 19100 }, { "epoch": 2.4298435313573337, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.20306396484375, "learning_rate": 1e-06, "loss": 0.5651, "mean_token_accuracy": 0.8722001314163208, "num_tokens": 728836403.0, "step": 19101 }, { "epoch": 2.429970741635924, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.47275161743164, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8718219995498657, "num_tokens": 728878485.0, "step": 19102 }, { "epoch": 2.4300979519145147, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.02942657470703, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8744568824768066, "num_tokens": 728915935.0, "step": 19103 }, { "epoch": 2.4302251621931052, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.3616828918457, "learning_rate": 1e-06, "loss": 0.6643, "mean_token_accuracy": 0.8456488251686096, "num_tokens": 728956847.0, "step": 19104 }, { "epoch": 2.4303523724716958, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.42854690551758, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8683457374572754, "num_tokens": 728994865.0, "step": 19105 }, { "epoch": 2.4304795827502863, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.41059112548828, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8546023368835449, "num_tokens": 729033030.0, "step": 19106 }, { "epoch": 2.430606793028877, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.64814758300781, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8737577199935913, "num_tokens": 729070430.0, "step": 19107 }, { "epoch": 2.4307340033074674, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.70920944213867, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8739334344863892, "num_tokens": 729102127.0, "step": 19108 }, { "epoch": 2.430861213586058, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.30465316772461, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.866203784942627, "num_tokens": 729138128.0, "step": 19109 }, { "epoch": 2.4309884238646484, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.06863784790039, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8595327138900757, "num_tokens": 729179756.0, "step": 19110 }, { "epoch": 2.431115634143239, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.40634536743164, "learning_rate": 1e-06, "loss": 0.6427, "mean_token_accuracy": 0.8514063358306885, "num_tokens": 729221356.0, "step": 19111 }, { "epoch": 2.4312428444218295, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.45786666870117, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8661508560180664, "num_tokens": 729260456.0, "step": 19112 }, { "epoch": 2.4313700547004196, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.426753997802734, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8532236814498901, "num_tokens": 729299654.0, "step": 19113 }, { "epoch": 2.4314972649790105, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.69386291503906, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8664393424987793, "num_tokens": 729341296.0, "step": 19114 }, { "epoch": 2.4316244752576006, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.65807342529297, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8664093017578125, "num_tokens": 729379632.0, "step": 19115 }, { "epoch": 2.431751685536191, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.50962448120117, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8647284507751465, "num_tokens": 729423202.0, "step": 19116 }, { "epoch": 2.4318788958147817, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.07219314575195, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8831316232681274, "num_tokens": 729457108.0, "step": 19117 }, { "epoch": 2.432006106093372, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.10401153564453, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8595543503761292, "num_tokens": 729494065.0, "step": 19118 }, { "epoch": 2.4321333163719627, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.821407318115234, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8690674901008606, "num_tokens": 729536621.0, "step": 19119 }, { "epoch": 2.4322605266505533, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.809749603271484, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8649166822433472, "num_tokens": 729574423.0, "step": 19120 }, { "epoch": 2.432387736929144, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.21492385864258, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8585482835769653, "num_tokens": 729614673.0, "step": 19121 }, { "epoch": 2.4325149472077343, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.87476348876953, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8622817993164062, "num_tokens": 729653049.0, "step": 19122 }, { "epoch": 2.432642157486325, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.85681915283203, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8626731634140015, "num_tokens": 729689667.0, "step": 19123 }, { "epoch": 2.4327693677649154, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.55879592895508, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8681955337524414, "num_tokens": 729734747.0, "step": 19124 }, { "epoch": 2.432896578043506, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.95676040649414, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8585328459739685, "num_tokens": 729770135.0, "step": 19125 }, { "epoch": 2.4330237883220964, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.68502426147461, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8614699244499207, "num_tokens": 729809203.0, "step": 19126 }, { "epoch": 2.433150998600687, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.95907211303711, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8696193695068359, "num_tokens": 729850164.0, "step": 19127 }, { "epoch": 2.4332782088792775, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.51193618774414, "learning_rate": 1e-06, "loss": 0.6344, "mean_token_accuracy": 0.8511433005332947, "num_tokens": 729884337.0, "step": 19128 }, { "epoch": 2.433405419157868, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.022552490234375, "learning_rate": 1e-06, "loss": 0.5623, "mean_token_accuracy": 0.8767793774604797, "num_tokens": 729924098.0, "step": 19129 }, { "epoch": 2.4335326294364585, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.6462516784668, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8824834227561951, "num_tokens": 729960671.0, "step": 19130 }, { "epoch": 2.433659839715049, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.14510726928711, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8648912310600281, "num_tokens": 730000780.0, "step": 19131 }, { "epoch": 2.4337870499936396, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.82733917236328, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8607057929039001, "num_tokens": 730045198.0, "step": 19132 }, { "epoch": 2.43391426027223, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.96746826171875, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8580955266952515, "num_tokens": 730085010.0, "step": 19133 }, { "epoch": 2.4340414705508207, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.77482223510742, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8579699993133545, "num_tokens": 730121512.0, "step": 19134 }, { "epoch": 2.434168680829411, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.84988784790039, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8630906939506531, "num_tokens": 730164859.0, "step": 19135 }, { "epoch": 2.4342958911080017, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.09971618652344, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8645398616790771, "num_tokens": 730202273.0, "step": 19136 }, { "epoch": 2.4344231013865922, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.040287017822266, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8552451133728027, "num_tokens": 730248208.0, "step": 19137 }, { "epoch": 2.4345503116651823, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.326786041259766, "learning_rate": 1e-06, "loss": 0.5811, "mean_token_accuracy": 0.8723676800727844, "num_tokens": 730291084.0, "step": 19138 }, { "epoch": 2.4346775219437733, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.075233459472656, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8697456121444702, "num_tokens": 730334504.0, "step": 19139 }, { "epoch": 2.4348047322223634, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.634735107421875, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8672659993171692, "num_tokens": 730373356.0, "step": 19140 }, { "epoch": 2.434931942500954, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.47169876098633, "learning_rate": 1e-06, "loss": 0.6453, "mean_token_accuracy": 0.849529504776001, "num_tokens": 730416229.0, "step": 19141 }, { "epoch": 2.4350591527795444, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.151180267333984, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.875874936580658, "num_tokens": 730458922.0, "step": 19142 }, { "epoch": 2.435186363058135, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.0816535949707, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8615911602973938, "num_tokens": 730494583.0, "step": 19143 }, { "epoch": 2.4353135733367255, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.14382553100586, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8652976751327515, "num_tokens": 730529526.0, "step": 19144 }, { "epoch": 2.435440783615316, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.209327697753906, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8662817478179932, "num_tokens": 730558584.0, "step": 19145 }, { "epoch": 2.4355679938939065, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.24278259277344, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8619126677513123, "num_tokens": 730598677.0, "step": 19146 }, { "epoch": 2.435695204172497, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.83938980102539, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8738840818405151, "num_tokens": 730636985.0, "step": 19147 }, { "epoch": 2.4358224144510876, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.09575271606445, "learning_rate": 1e-06, "loss": 0.6611, "mean_token_accuracy": 0.845556378364563, "num_tokens": 730675409.0, "step": 19148 }, { "epoch": 2.435949624729678, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.17527389526367, "learning_rate": 1e-06, "loss": 0.6321, "mean_token_accuracy": 0.8520904779434204, "num_tokens": 730714739.0, "step": 19149 }, { "epoch": 2.4360768350082687, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.911705017089844, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8622304201126099, "num_tokens": 730757059.0, "step": 19150 }, { "epoch": 2.436204045286859, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.458251953125, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8619413375854492, "num_tokens": 730793469.0, "step": 19151 }, { "epoch": 2.4363312555654497, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.00498580932617, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8690298199653625, "num_tokens": 730829336.0, "step": 19152 }, { "epoch": 2.4364584658440402, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.51936340332031, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8614641427993774, "num_tokens": 730869304.0, "step": 19153 }, { "epoch": 2.4365856761226308, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.522457122802734, "learning_rate": 1e-06, "loss": 0.6423, "mean_token_accuracy": 0.8535147905349731, "num_tokens": 730905315.0, "step": 19154 }, { "epoch": 2.4367128864012213, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.400291442871094, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8546565771102905, "num_tokens": 730947346.0, "step": 19155 }, { "epoch": 2.436840096679812, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.645320892333984, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8670827746391296, "num_tokens": 730976907.0, "step": 19156 }, { "epoch": 2.4369673069584024, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.09527587890625, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8597570061683655, "num_tokens": 731015940.0, "step": 19157 }, { "epoch": 2.437094517236993, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.9257698059082, "learning_rate": 1e-06, "loss": 0.5357, "mean_token_accuracy": 0.8817988634109497, "num_tokens": 731051345.0, "step": 19158 }, { "epoch": 2.4372217275155834, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.99261474609375, "learning_rate": 1e-06, "loss": 0.6897, "mean_token_accuracy": 0.830062747001648, "num_tokens": 731089642.0, "step": 19159 }, { "epoch": 2.437348937794174, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.801055908203125, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8813228011131287, "num_tokens": 731133196.0, "step": 19160 }, { "epoch": 2.4374761480727645, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.31597137451172, "learning_rate": 1e-06, "loss": 0.6435, "mean_token_accuracy": 0.8522599935531616, "num_tokens": 731175510.0, "step": 19161 }, { "epoch": 2.437603358351355, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.82203674316406, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.870781660079956, "num_tokens": 731217526.0, "step": 19162 }, { "epoch": 2.437730568629945, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.199485778808594, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.8526858687400818, "num_tokens": 731259525.0, "step": 19163 }, { "epoch": 2.437857778908536, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.918617248535156, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8673362731933594, "num_tokens": 731301480.0, "step": 19164 }, { "epoch": 2.437984989187126, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.36123275756836, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.871901273727417, "num_tokens": 731347385.0, "step": 19165 }, { "epoch": 2.4381121994657167, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.992271423339844, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8677440881729126, "num_tokens": 731383666.0, "step": 19166 }, { "epoch": 2.438239409744307, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.586116790771484, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8650375604629517, "num_tokens": 731420297.0, "step": 19167 }, { "epoch": 2.4383666200228977, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.954158782958984, "learning_rate": 1e-06, "loss": 0.6356, "mean_token_accuracy": 0.857836902141571, "num_tokens": 731455083.0, "step": 19168 }, { "epoch": 2.4384938303014883, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.705467224121094, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8724664449691772, "num_tokens": 731482555.0, "step": 19169 }, { "epoch": 2.438621040580079, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.58970260620117, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8763540983200073, "num_tokens": 731519459.0, "step": 19170 }, { "epoch": 2.4387482508586693, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.80333709716797, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8683425188064575, "num_tokens": 731554577.0, "step": 19171 }, { "epoch": 2.43887546113726, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.094730377197266, "learning_rate": 1e-06, "loss": 0.629, "mean_token_accuracy": 0.8542855381965637, "num_tokens": 731594993.0, "step": 19172 }, { "epoch": 2.4390026714158504, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.39250564575195, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.868712306022644, "num_tokens": 731629182.0, "step": 19173 }, { "epoch": 2.439129881694441, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.29136657714844, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.861282467842102, "num_tokens": 731667661.0, "step": 19174 }, { "epoch": 2.4392570919730314, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.062767028808594, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8649356365203857, "num_tokens": 731704911.0, "step": 19175 }, { "epoch": 2.439384302251622, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.45547866821289, "learning_rate": 1e-06, "loss": 0.6538, "mean_token_accuracy": 0.846036434173584, "num_tokens": 731744990.0, "step": 19176 }, { "epoch": 2.4395115125302125, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.266178131103516, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8658923506736755, "num_tokens": 731783335.0, "step": 19177 }, { "epoch": 2.439638722808803, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.499202728271484, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8669042587280273, "num_tokens": 731819206.0, "step": 19178 }, { "epoch": 2.4397659330873935, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.017578125, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8594486117362976, "num_tokens": 731855291.0, "step": 19179 }, { "epoch": 2.439893143365984, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.282981872558594, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8669765591621399, "num_tokens": 731899392.0, "step": 19180 }, { "epoch": 2.4400203536445746, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.40966033935547, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.863860011100769, "num_tokens": 731937502.0, "step": 19181 }, { "epoch": 2.440147563923165, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.44017028808594, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8674975633621216, "num_tokens": 731974946.0, "step": 19182 }, { "epoch": 2.4402747742017556, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.92750549316406, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8509941101074219, "num_tokens": 732018732.0, "step": 19183 }, { "epoch": 2.440401984480346, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.25112533569336, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8628425598144531, "num_tokens": 732052036.0, "step": 19184 }, { "epoch": 2.4405291947589367, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.61541748046875, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8587693572044373, "num_tokens": 732090944.0, "step": 19185 }, { "epoch": 2.440656405037527, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.95709228515625, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8658370971679688, "num_tokens": 732131406.0, "step": 19186 }, { "epoch": 2.4407836153161178, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.27021408081055, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8627641201019287, "num_tokens": 732171701.0, "step": 19187 }, { "epoch": 2.440910825594708, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.52915573120117, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8734549283981323, "num_tokens": 732209658.0, "step": 19188 }, { "epoch": 2.4410380358732984, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.57789993286133, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8526053428649902, "num_tokens": 732251395.0, "step": 19189 }, { "epoch": 2.441165246151889, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.89170455932617, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.8525682091712952, "num_tokens": 732289306.0, "step": 19190 }, { "epoch": 2.4412924564304794, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.30886459350586, "learning_rate": 1e-06, "loss": 0.6389, "mean_token_accuracy": 0.8480136394500732, "num_tokens": 732337916.0, "step": 19191 }, { "epoch": 2.44141966670907, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.48567581176758, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8511302471160889, "num_tokens": 732380358.0, "step": 19192 }, { "epoch": 2.4415468769876605, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.04351806640625, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.873731255531311, "num_tokens": 732416534.0, "step": 19193 }, { "epoch": 2.441674087266251, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.21952819824219, "learning_rate": 1e-06, "loss": 0.659, "mean_token_accuracy": 0.842468798160553, "num_tokens": 732458927.0, "step": 19194 }, { "epoch": 2.4418012975448415, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.76118850708008, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8557122945785522, "num_tokens": 732501261.0, "step": 19195 }, { "epoch": 2.441928507823432, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.39171600341797, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8714252710342407, "num_tokens": 732541162.0, "step": 19196 }, { "epoch": 2.4420557181020226, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.650543212890625, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8613247275352478, "num_tokens": 732578237.0, "step": 19197 }, { "epoch": 2.442182928380613, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.53786849975586, "learning_rate": 1e-06, "loss": 0.6327, "mean_token_accuracy": 0.851323127746582, "num_tokens": 732618624.0, "step": 19198 }, { "epoch": 2.4423101386592037, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.557518005371094, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8598926663398743, "num_tokens": 732661747.0, "step": 19199 }, { "epoch": 2.442437348937794, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.1478271484375, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8413337469100952, "num_tokens": 732703511.0, "step": 19200 }, { "epoch": 2.4425645592163847, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.759849548339844, "learning_rate": 1e-06, "loss": 0.6692, "mean_token_accuracy": 0.8433240652084351, "num_tokens": 732746749.0, "step": 19201 }, { "epoch": 2.4426917694949752, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 48.95054244995117, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8638428449630737, "num_tokens": 732783184.0, "step": 19202 }, { "epoch": 2.4428189797735658, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.07060623168945, "learning_rate": 1e-06, "loss": 0.6259, "mean_token_accuracy": 0.8561518788337708, "num_tokens": 732818664.0, "step": 19203 }, { "epoch": 2.4429461900521563, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.99282455444336, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8536280393600464, "num_tokens": 732854286.0, "step": 19204 }, { "epoch": 2.443073400330747, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.0926513671875, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8690940141677856, "num_tokens": 732893032.0, "step": 19205 }, { "epoch": 2.4432006106093374, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.91536331176758, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8623615503311157, "num_tokens": 732929942.0, "step": 19206 }, { "epoch": 2.443327820887928, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.71935272216797, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.856553852558136, "num_tokens": 732968218.0, "step": 19207 }, { "epoch": 2.4434550311665184, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.16887664794922, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8870440721511841, "num_tokens": 733001464.0, "step": 19208 }, { "epoch": 2.443582241445109, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.9694709777832, "learning_rate": 1e-06, "loss": 0.6349, "mean_token_accuracy": 0.8542012572288513, "num_tokens": 733036311.0, "step": 19209 }, { "epoch": 2.4437094517236995, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.495567321777344, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8699619770050049, "num_tokens": 733070487.0, "step": 19210 }, { "epoch": 2.4438366620022896, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.11918258666992, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8615474700927734, "num_tokens": 733110357.0, "step": 19211 }, { "epoch": 2.4439638722808805, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.85831069946289, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8544589281082153, "num_tokens": 733149070.0, "step": 19212 }, { "epoch": 2.4440910825594706, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.102237701416016, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8801966905593872, "num_tokens": 733180848.0, "step": 19213 }, { "epoch": 2.444218292838061, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.990169525146484, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8627710342407227, "num_tokens": 733219204.0, "step": 19214 }, { "epoch": 2.4443455031166517, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.129478454589844, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8526207208633423, "num_tokens": 733255237.0, "step": 19215 }, { "epoch": 2.444472713395242, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.784912109375, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8740415573120117, "num_tokens": 733293243.0, "step": 19216 }, { "epoch": 2.4445999236738327, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.30913162231445, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8665120601654053, "num_tokens": 733333530.0, "step": 19217 }, { "epoch": 2.4447271339524232, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.569007873535156, "learning_rate": 1e-06, "loss": 0.661, "mean_token_accuracy": 0.846215546131134, "num_tokens": 733373790.0, "step": 19218 }, { "epoch": 2.4448543442310138, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.71472930908203, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8641126751899719, "num_tokens": 733416037.0, "step": 19219 }, { "epoch": 2.4449815545096043, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.40243911743164, "learning_rate": 1e-06, "loss": 0.6088, "mean_token_accuracy": 0.8594111204147339, "num_tokens": 733453014.0, "step": 19220 }, { "epoch": 2.445108764788195, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.181121826171875, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8492927551269531, "num_tokens": 733491197.0, "step": 19221 }, { "epoch": 2.4452359750667854, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.186187744140625, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8767119646072388, "num_tokens": 733531142.0, "step": 19222 }, { "epoch": 2.445363185345376, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.707462310791016, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8560847043991089, "num_tokens": 733573916.0, "step": 19223 }, { "epoch": 2.4454903956239664, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.26572036743164, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8576215505599976, "num_tokens": 733613204.0, "step": 19224 }, { "epoch": 2.445617605902557, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.32222366333008, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8580670952796936, "num_tokens": 733653618.0, "step": 19225 }, { "epoch": 2.4457448161811475, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.66823959350586, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.879727303981781, "num_tokens": 733687795.0, "step": 19226 }, { "epoch": 2.445872026459738, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.58602523803711, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8623988628387451, "num_tokens": 733723574.0, "step": 19227 }, { "epoch": 2.4459992367383285, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.49247360229492, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8694693446159363, "num_tokens": 733762051.0, "step": 19228 }, { "epoch": 2.446126447016919, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.031070709228516, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8798061609268188, "num_tokens": 733800640.0, "step": 19229 }, { "epoch": 2.4462536572955096, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.359493255615234, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8518054485321045, "num_tokens": 733841321.0, "step": 19230 }, { "epoch": 2.4463808675741, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.77287673950195, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.86528080701828, "num_tokens": 733874104.0, "step": 19231 }, { "epoch": 2.4465080778526906, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.72309112548828, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8673689365386963, "num_tokens": 733913127.0, "step": 19232 }, { "epoch": 2.446635288131281, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.12407684326172, "learning_rate": 1e-06, "loss": 0.5449, "mean_token_accuracy": 0.8800572156906128, "num_tokens": 733951900.0, "step": 19233 }, { "epoch": 2.4467624984098717, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.05556106567383, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8649314045906067, "num_tokens": 733992866.0, "step": 19234 }, { "epoch": 2.4468897086884622, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.339454650878906, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.857261598110199, "num_tokens": 734029895.0, "step": 19235 }, { "epoch": 2.4470169189670523, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.3165283203125, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8783086538314819, "num_tokens": 734066954.0, "step": 19236 }, { "epoch": 2.4471441292456433, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.51571273803711, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8850526809692383, "num_tokens": 734102412.0, "step": 19237 }, { "epoch": 2.4472713395242334, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.3963623046875, "learning_rate": 1e-06, "loss": 0.547, "mean_token_accuracy": 0.8804831504821777, "num_tokens": 734141400.0, "step": 19238 }, { "epoch": 2.447398549802824, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.623756408691406, "learning_rate": 1e-06, "loss": 0.6777, "mean_token_accuracy": 0.8430297374725342, "num_tokens": 734176628.0, "step": 19239 }, { "epoch": 2.4475257600814144, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.14668655395508, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.868515133857727, "num_tokens": 734219756.0, "step": 19240 }, { "epoch": 2.447652970360005, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.34832763671875, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8615385293960571, "num_tokens": 734250509.0, "step": 19241 }, { "epoch": 2.4477801806385955, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.93951416015625, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8692025542259216, "num_tokens": 734287887.0, "step": 19242 }, { "epoch": 2.447907390917186, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.120094299316406, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8681957721710205, "num_tokens": 734330446.0, "step": 19243 }, { "epoch": 2.4480346011957765, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.193138122558594, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.870257556438446, "num_tokens": 734368427.0, "step": 19244 }, { "epoch": 2.448161811474367, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.35601806640625, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8816192150115967, "num_tokens": 734405103.0, "step": 19245 }, { "epoch": 2.4482890217529576, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.03534698486328, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8601151704788208, "num_tokens": 734448069.0, "step": 19246 }, { "epoch": 2.448416232031548, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.112972259521484, "learning_rate": 1e-06, "loss": 0.6473, "mean_token_accuracy": 0.8502376079559326, "num_tokens": 734486016.0, "step": 19247 }, { "epoch": 2.4485434423101387, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.4648551940918, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8513477444648743, "num_tokens": 734526943.0, "step": 19248 }, { "epoch": 2.448670652588729, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.70697021484375, "learning_rate": 1e-06, "loss": 0.5683, "mean_token_accuracy": 0.8740294575691223, "num_tokens": 734566137.0, "step": 19249 }, { "epoch": 2.4487978628673197, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.88964080810547, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8607226014137268, "num_tokens": 734602366.0, "step": 19250 }, { "epoch": 2.4489250731459102, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.55418014526367, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8719131946563721, "num_tokens": 734644251.0, "step": 19251 }, { "epoch": 2.4490522834245008, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.832210540771484, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8598546385765076, "num_tokens": 734682938.0, "step": 19252 }, { "epoch": 2.4491794937030913, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.88383865356445, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.864361047744751, "num_tokens": 734722975.0, "step": 19253 }, { "epoch": 2.449306703981682, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.5659294128418, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8663886785507202, "num_tokens": 734761361.0, "step": 19254 }, { "epoch": 2.4494339142602723, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.81936264038086, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8597652316093445, "num_tokens": 734800597.0, "step": 19255 }, { "epoch": 2.449561124538863, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.246707916259766, "learning_rate": 1e-06, "loss": 0.6369, "mean_token_accuracy": 0.8518545627593994, "num_tokens": 734840575.0, "step": 19256 }, { "epoch": 2.4496883348174534, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.17298126220703, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8542560935020447, "num_tokens": 734876943.0, "step": 19257 }, { "epoch": 2.449815545096044, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.480796813964844, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8796707391738892, "num_tokens": 734916436.0, "step": 19258 }, { "epoch": 2.4499427553746345, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.01972579956055, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8623896241188049, "num_tokens": 734956336.0, "step": 19259 }, { "epoch": 2.450069965653225, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.623783111572266, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.861266016960144, "num_tokens": 734997132.0, "step": 19260 }, { "epoch": 2.450197175931815, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.355838775634766, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8513122797012329, "num_tokens": 735036192.0, "step": 19261 }, { "epoch": 2.450324386210406, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.28717803955078, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8776205778121948, "num_tokens": 735079368.0, "step": 19262 }, { "epoch": 2.450451596488996, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.973899841308594, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8696168661117554, "num_tokens": 735109714.0, "step": 19263 }, { "epoch": 2.4505788067675867, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.46294021606445, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8602883815765381, "num_tokens": 735152712.0, "step": 19264 }, { "epoch": 2.450706017046177, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.28838348388672, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8583102226257324, "num_tokens": 735193313.0, "step": 19265 }, { "epoch": 2.4508332273247677, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.86298751831055, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8778938055038452, "num_tokens": 735235413.0, "step": 19266 }, { "epoch": 2.4509604376033582, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.875282287597656, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8635683059692383, "num_tokens": 735270809.0, "step": 19267 }, { "epoch": 2.4510876478819488, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.94578170776367, "learning_rate": 1e-06, "loss": 0.6301, "mean_token_accuracy": 0.854925274848938, "num_tokens": 735309790.0, "step": 19268 }, { "epoch": 2.4512148581605393, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.16466522216797, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8643808364868164, "num_tokens": 735341084.0, "step": 19269 }, { "epoch": 2.45134206843913, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.643497467041016, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.879327118396759, "num_tokens": 735376964.0, "step": 19270 }, { "epoch": 2.4514692787177204, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.19415283203125, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.873924970626831, "num_tokens": 735413296.0, "step": 19271 }, { "epoch": 2.451596488996311, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.82904052734375, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.868339478969574, "num_tokens": 735454992.0, "step": 19272 }, { "epoch": 2.4517236992749014, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.07242965698242, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8651586174964905, "num_tokens": 735493315.0, "step": 19273 }, { "epoch": 2.451850909553492, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.25151824951172, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8731650114059448, "num_tokens": 735532666.0, "step": 19274 }, { "epoch": 2.4519781198320825, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.27254104614258, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8605446815490723, "num_tokens": 735571987.0, "step": 19275 }, { "epoch": 2.452105330110673, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.79281234741211, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8607200384140015, "num_tokens": 735609409.0, "step": 19276 }, { "epoch": 2.4522325403892635, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.3297004699707, "learning_rate": 1e-06, "loss": 0.6844, "mean_token_accuracy": 0.8365753889083862, "num_tokens": 735647299.0, "step": 19277 }, { "epoch": 2.452359750667854, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.18117904663086, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8699305057525635, "num_tokens": 735677757.0, "step": 19278 }, { "epoch": 2.4524869609464446, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.57337951660156, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8690398931503296, "num_tokens": 735723924.0, "step": 19279 }, { "epoch": 2.452614171225035, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.784034729003906, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8476895689964294, "num_tokens": 735758105.0, "step": 19280 }, { "epoch": 2.4527413815036256, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.10884475708008, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.860265851020813, "num_tokens": 735797603.0, "step": 19281 }, { "epoch": 2.452868591782216, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.66865539550781, "learning_rate": 1e-06, "loss": 0.6891, "mean_token_accuracy": 0.8360607624053955, "num_tokens": 735838958.0, "step": 19282 }, { "epoch": 2.4529958020608067, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.84455490112305, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8552502393722534, "num_tokens": 735878894.0, "step": 19283 }, { "epoch": 2.453123012339397, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.19887924194336, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8937325477600098, "num_tokens": 735913222.0, "step": 19284 }, { "epoch": 2.4532502226179878, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.47428512573242, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8713182210922241, "num_tokens": 735951884.0, "step": 19285 }, { "epoch": 2.453377432896578, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.130821228027344, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8579528331756592, "num_tokens": 735998955.0, "step": 19286 }, { "epoch": 2.4535046431751684, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.71365737915039, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.862098753452301, "num_tokens": 736039631.0, "step": 19287 }, { "epoch": 2.453631853453759, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.864288330078125, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8594269752502441, "num_tokens": 736085100.0, "step": 19288 }, { "epoch": 2.4537590637323494, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.823944091796875, "learning_rate": 1e-06, "loss": 0.5574, "mean_token_accuracy": 0.8781532049179077, "num_tokens": 736119680.0, "step": 19289 }, { "epoch": 2.45388627401094, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.9649772644043, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8742313981056213, "num_tokens": 736159769.0, "step": 19290 }, { "epoch": 2.4540134842895305, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.95518112182617, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8709340691566467, "num_tokens": 736199566.0, "step": 19291 }, { "epoch": 2.454140694568121, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.95698928833008, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8686609268188477, "num_tokens": 736239238.0, "step": 19292 }, { "epoch": 2.4542679048467115, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.75758743286133, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8682569265365601, "num_tokens": 736274576.0, "step": 19293 }, { "epoch": 2.454395115125302, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.96792221069336, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8551619648933411, "num_tokens": 736307509.0, "step": 19294 }, { "epoch": 2.4545223254038926, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.40618133544922, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.860499382019043, "num_tokens": 736348313.0, "step": 19295 }, { "epoch": 2.454649535682483, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.288272857666016, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8592724800109863, "num_tokens": 736380716.0, "step": 19296 }, { "epoch": 2.4547767459610736, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.68865966796875, "learning_rate": 1e-06, "loss": 0.6758, "mean_token_accuracy": 0.8423106670379639, "num_tokens": 736422669.0, "step": 19297 }, { "epoch": 2.454903956239664, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.831016540527344, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8706101179122925, "num_tokens": 736452929.0, "step": 19298 }, { "epoch": 2.4550311665182547, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.60490417480469, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8652490377426147, "num_tokens": 736486755.0, "step": 19299 }, { "epoch": 2.4551583767968452, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.01228713989258, "learning_rate": 1e-06, "loss": 0.5692, "mean_token_accuracy": 0.8732657432556152, "num_tokens": 736521542.0, "step": 19300 }, { "epoch": 2.4552855870754358, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.509979248046875, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8640182018280029, "num_tokens": 736557312.0, "step": 19301 }, { "epoch": 2.4554127973540263, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.349700927734375, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8602883815765381, "num_tokens": 736596469.0, "step": 19302 }, { "epoch": 2.455540007632617, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.38920211791992, "learning_rate": 1e-06, "loss": 0.6528, "mean_token_accuracy": 0.8473634719848633, "num_tokens": 736636078.0, "step": 19303 }, { "epoch": 2.4556672179112073, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.839847564697266, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8764759302139282, "num_tokens": 736674117.0, "step": 19304 }, { "epoch": 2.455794428189798, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.09418487548828, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.8593382835388184, "num_tokens": 736709116.0, "step": 19305 }, { "epoch": 2.4559216384683884, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.086063385009766, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8553500175476074, "num_tokens": 736750383.0, "step": 19306 }, { "epoch": 2.456048848746979, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.38551330566406, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8518931865692139, "num_tokens": 736790836.0, "step": 19307 }, { "epoch": 2.4561760590255695, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.75514221191406, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8599918484687805, "num_tokens": 736826085.0, "step": 19308 }, { "epoch": 2.4563032693041595, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.35139465332031, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.862837016582489, "num_tokens": 736860599.0, "step": 19309 }, { "epoch": 2.4564304795827505, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.072566986083984, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8815435767173767, "num_tokens": 736893286.0, "step": 19310 }, { "epoch": 2.4565576898613406, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.813594818115234, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8726197481155396, "num_tokens": 736931597.0, "step": 19311 }, { "epoch": 2.456684900139931, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.86211395263672, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8716161847114563, "num_tokens": 736972183.0, "step": 19312 }, { "epoch": 2.4568121104185217, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.27019500732422, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.870501697063446, "num_tokens": 737004184.0, "step": 19313 }, { "epoch": 2.456939320697112, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.596954345703125, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8635153770446777, "num_tokens": 737042644.0, "step": 19314 }, { "epoch": 2.4570665309757027, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.504825592041016, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8525789380073547, "num_tokens": 737074830.0, "step": 19315 }, { "epoch": 2.4571937412542932, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.802005767822266, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8707120418548584, "num_tokens": 737107756.0, "step": 19316 }, { "epoch": 2.4573209515328838, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.261356353759766, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8725497722625732, "num_tokens": 737143457.0, "step": 19317 }, { "epoch": 2.4574481618114743, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.52456283569336, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8590697050094604, "num_tokens": 737177005.0, "step": 19318 }, { "epoch": 2.457575372090065, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.46467590332031, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8548251390457153, "num_tokens": 737218265.0, "step": 19319 }, { "epoch": 2.4577025823686554, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.79365539550781, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8520298600196838, "num_tokens": 737259495.0, "step": 19320 }, { "epoch": 2.457829792647246, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.32611846923828, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.8561847805976868, "num_tokens": 737300073.0, "step": 19321 }, { "epoch": 2.4579570029258364, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.40549087524414, "learning_rate": 1e-06, "loss": 0.5889, "mean_token_accuracy": 0.8647270202636719, "num_tokens": 737334716.0, "step": 19322 }, { "epoch": 2.458084213204427, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.76821517944336, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8693916201591492, "num_tokens": 737377086.0, "step": 19323 }, { "epoch": 2.4582114234830175, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.187965393066406, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.853431224822998, "num_tokens": 737418259.0, "step": 19324 }, { "epoch": 2.458338633761608, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.01950454711914, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.8530846834182739, "num_tokens": 737453150.0, "step": 19325 }, { "epoch": 2.4584658440401985, "ewc_loss": 0.193359375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001697540283203125, "grad_norm": 49.056270599365234, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8785107731819153, "num_tokens": 737495552.0, "step": 19326 }, { "epoch": 2.458593054318789, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.39149475097656, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8677905797958374, "num_tokens": 737537518.0, "step": 19327 }, { "epoch": 2.4587202645973796, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.74781799316406, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8587552309036255, "num_tokens": 737577472.0, "step": 19328 }, { "epoch": 2.45884747487597, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.847412109375, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8545897006988525, "num_tokens": 737617250.0, "step": 19329 }, { "epoch": 2.4589746851545606, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.260501861572266, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8678600788116455, "num_tokens": 737655544.0, "step": 19330 }, { "epoch": 2.459101895433151, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.54996109008789, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8672673106193542, "num_tokens": 737688644.0, "step": 19331 }, { "epoch": 2.4592291057117417, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.18246078491211, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8672765493392944, "num_tokens": 737730264.0, "step": 19332 }, { "epoch": 2.459356315990332, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.15810775756836, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.857848048210144, "num_tokens": 737763578.0, "step": 19333 }, { "epoch": 2.4594835262689223, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.42078399658203, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.865146279335022, "num_tokens": 737804062.0, "step": 19334 }, { "epoch": 2.4596107365475133, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.14006423950195, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8772507309913635, "num_tokens": 737845608.0, "step": 19335 }, { "epoch": 2.4597379468261034, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.96648025512695, "learning_rate": 1e-06, "loss": 0.6657, "mean_token_accuracy": 0.8453280925750732, "num_tokens": 737881366.0, "step": 19336 }, { "epoch": 2.459865157104694, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.40263366699219, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.855635404586792, "num_tokens": 737922595.0, "step": 19337 }, { "epoch": 2.4599923673832844, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.8866081237793, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.870338499546051, "num_tokens": 737956368.0, "step": 19338 }, { "epoch": 2.460119577661875, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.686153411865234, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8688560128211975, "num_tokens": 738000167.0, "step": 19339 }, { "epoch": 2.4602467879404655, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.805362701416016, "learning_rate": 1e-06, "loss": 0.6439, "mean_token_accuracy": 0.8557589054107666, "num_tokens": 738040850.0, "step": 19340 }, { "epoch": 2.460373998219056, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.03461456298828, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8615256547927856, "num_tokens": 738080370.0, "step": 19341 }, { "epoch": 2.4605012084976465, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.75563049316406, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8688188791275024, "num_tokens": 738116751.0, "step": 19342 }, { "epoch": 2.460628418776237, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.10758972167969, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8817782402038574, "num_tokens": 738154597.0, "step": 19343 }, { "epoch": 2.4607556290548276, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.364566802978516, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8714000582695007, "num_tokens": 738186484.0, "step": 19344 }, { "epoch": 2.460882839333418, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.26576232910156, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8665204644203186, "num_tokens": 738217722.0, "step": 19345 }, { "epoch": 2.4610100496120086, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.05006408691406, "learning_rate": 1e-06, "loss": 0.558, "mean_token_accuracy": 0.8768894672393799, "num_tokens": 738255304.0, "step": 19346 }, { "epoch": 2.461137259890599, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.54642868041992, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8603124618530273, "num_tokens": 738292162.0, "step": 19347 }, { "epoch": 2.4612644701691897, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.14823913574219, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8766969442367554, "num_tokens": 738332851.0, "step": 19348 }, { "epoch": 2.4613916804477802, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.053382873535156, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8744466304779053, "num_tokens": 738375120.0, "step": 19349 }, { "epoch": 2.4615188907263708, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.303306579589844, "learning_rate": 1e-06, "loss": 0.6236, "mean_token_accuracy": 0.855014443397522, "num_tokens": 738413857.0, "step": 19350 }, { "epoch": 2.4616461010049613, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.746097564697266, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8574488162994385, "num_tokens": 738450097.0, "step": 19351 }, { "epoch": 2.461773311283552, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.79296875, "learning_rate": 1e-06, "loss": 0.6723, "mean_token_accuracy": 0.8464754819869995, "num_tokens": 738483784.0, "step": 19352 }, { "epoch": 2.4619005215621423, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.941951751708984, "learning_rate": 1e-06, "loss": 0.678, "mean_token_accuracy": 0.8443124294281006, "num_tokens": 738526820.0, "step": 19353 }, { "epoch": 2.462027731840733, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.60670852661133, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8491070866584778, "num_tokens": 738569049.0, "step": 19354 }, { "epoch": 2.4621549421193234, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.7721061706543, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8598647117614746, "num_tokens": 738608320.0, "step": 19355 }, { "epoch": 2.462282152397914, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.921661376953125, "learning_rate": 1e-06, "loss": 0.5562, "mean_token_accuracy": 0.8786922693252563, "num_tokens": 738645566.0, "step": 19356 }, { "epoch": 2.4624093626765045, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.653690338134766, "learning_rate": 1e-06, "loss": 0.6849, "mean_token_accuracy": 0.8403502702713013, "num_tokens": 738686184.0, "step": 19357 }, { "epoch": 2.462536572955095, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.10742950439453, "learning_rate": 1e-06, "loss": 0.6448, "mean_token_accuracy": 0.8484089374542236, "num_tokens": 738728820.0, "step": 19358 }, { "epoch": 2.462663783233685, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.00447082519531, "learning_rate": 1e-06, "loss": 0.7025, "mean_token_accuracy": 0.8327904939651489, "num_tokens": 738760180.0, "step": 19359 }, { "epoch": 2.462790993512276, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.570343017578125, "learning_rate": 1e-06, "loss": 0.5684, "mean_token_accuracy": 0.8705984950065613, "num_tokens": 738795501.0, "step": 19360 }, { "epoch": 2.462918203790866, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.4197998046875, "learning_rate": 1e-06, "loss": 0.6356, "mean_token_accuracy": 0.8533572554588318, "num_tokens": 738831027.0, "step": 19361 }, { "epoch": 2.4630454140694567, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.63916778564453, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8452211618423462, "num_tokens": 738870383.0, "step": 19362 }, { "epoch": 2.463172624348047, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.72224426269531, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8690599799156189, "num_tokens": 738908231.0, "step": 19363 }, { "epoch": 2.4632998346266377, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.07687759399414, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8548191785812378, "num_tokens": 738950927.0, "step": 19364 }, { "epoch": 2.4634270449052282, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.18042755126953, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8750380277633667, "num_tokens": 738989857.0, "step": 19365 }, { "epoch": 2.4635542551838188, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.957740783691406, "learning_rate": 1e-06, "loss": 0.5636, "mean_token_accuracy": 0.8760672211647034, "num_tokens": 739021281.0, "step": 19366 }, { "epoch": 2.4636814654624093, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.57341384887695, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8693587183952332, "num_tokens": 739063475.0, "step": 19367 }, { "epoch": 2.463808675741, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.27225112915039, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8598144054412842, "num_tokens": 739107083.0, "step": 19368 }, { "epoch": 2.4639358860195903, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.781795501708984, "learning_rate": 1e-06, "loss": 0.6705, "mean_token_accuracy": 0.8411680459976196, "num_tokens": 739152002.0, "step": 19369 }, { "epoch": 2.464063096298181, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.40019989013672, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.860613226890564, "num_tokens": 739191781.0, "step": 19370 }, { "epoch": 2.4641903065767714, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.71506118774414, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8852139711380005, "num_tokens": 739230374.0, "step": 19371 }, { "epoch": 2.464317516855362, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.50155258178711, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8570234775543213, "num_tokens": 739267310.0, "step": 19372 }, { "epoch": 2.4644447271339525, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.816410064697266, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8659349679946899, "num_tokens": 739305074.0, "step": 19373 }, { "epoch": 2.464571937412543, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.25429916381836, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8540005683898926, "num_tokens": 739343028.0, "step": 19374 }, { "epoch": 2.4646991476911335, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.95682907104492, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8614957332611084, "num_tokens": 739384364.0, "step": 19375 }, { "epoch": 2.464826357969724, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.463661193847656, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8793502449989319, "num_tokens": 739423564.0, "step": 19376 }, { "epoch": 2.4649535682483146, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.77695083618164, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8722133040428162, "num_tokens": 739457609.0, "step": 19377 }, { "epoch": 2.465080778526905, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.34272766113281, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8644249439239502, "num_tokens": 739497204.0, "step": 19378 }, { "epoch": 2.4652079888054956, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.11520767211914, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8584979772567749, "num_tokens": 739543688.0, "step": 19379 }, { "epoch": 2.465335199084086, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.34837341308594, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.875289261341095, "num_tokens": 739578221.0, "step": 19380 }, { "epoch": 2.4654624093626767, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.514163970947266, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8686392903327942, "num_tokens": 739615892.0, "step": 19381 }, { "epoch": 2.4655896196412668, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.304054260253906, "learning_rate": 1e-06, "loss": 0.6319, "mean_token_accuracy": 0.8509042263031006, "num_tokens": 739657697.0, "step": 19382 }, { "epoch": 2.4657168299198577, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.46479034423828, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8734674453735352, "num_tokens": 739694173.0, "step": 19383 }, { "epoch": 2.465844040198448, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.79072570800781, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8601374626159668, "num_tokens": 739734111.0, "step": 19384 }, { "epoch": 2.4659712504770384, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.68244171142578, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8640831708908081, "num_tokens": 739771409.0, "step": 19385 }, { "epoch": 2.466098460755629, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 48.8187370300293, "learning_rate": 1e-06, "loss": 0.652, "mean_token_accuracy": 0.8503021597862244, "num_tokens": 739807185.0, "step": 19386 }, { "epoch": 2.4662256710342194, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.903533935546875, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8734778165817261, "num_tokens": 739837750.0, "step": 19387 }, { "epoch": 2.46635288131281, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.494598388671875, "learning_rate": 1e-06, "loss": 0.6078, "mean_token_accuracy": 0.8622198104858398, "num_tokens": 739879516.0, "step": 19388 }, { "epoch": 2.4664800915914005, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.813209533691406, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8696343898773193, "num_tokens": 739918688.0, "step": 19389 }, { "epoch": 2.466607301869991, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.17070770263672, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8675387501716614, "num_tokens": 739958978.0, "step": 19390 }, { "epoch": 2.4667345121485815, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.81884002685547, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8674585819244385, "num_tokens": 740002428.0, "step": 19391 }, { "epoch": 2.466861722427172, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.55876922607422, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8840457201004028, "num_tokens": 740045856.0, "step": 19392 }, { "epoch": 2.4669889327057626, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.048770904541016, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8611178398132324, "num_tokens": 740084512.0, "step": 19393 }, { "epoch": 2.467116142984353, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.261444091796875, "learning_rate": 1e-06, "loss": 0.5619, "mean_token_accuracy": 0.875483512878418, "num_tokens": 740129624.0, "step": 19394 }, { "epoch": 2.4672433532629436, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.718379974365234, "learning_rate": 1e-06, "loss": 0.6581, "mean_token_accuracy": 0.8465869426727295, "num_tokens": 740168368.0, "step": 19395 }, { "epoch": 2.467370563541534, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.294166564941406, "learning_rate": 1e-06, "loss": 0.6262, "mean_token_accuracy": 0.8572056293487549, "num_tokens": 740206670.0, "step": 19396 }, { "epoch": 2.4674977738201247, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.91046142578125, "learning_rate": 1e-06, "loss": 0.6698, "mean_token_accuracy": 0.8410863876342773, "num_tokens": 740246422.0, "step": 19397 }, { "epoch": 2.4676249840987152, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.518802642822266, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8606662750244141, "num_tokens": 740284450.0, "step": 19398 }, { "epoch": 2.4677521943773058, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.22395324707031, "learning_rate": 1e-06, "loss": 0.6719, "mean_token_accuracy": 0.8392537832260132, "num_tokens": 740319062.0, "step": 19399 }, { "epoch": 2.4678794046558963, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.57262420654297, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8627268671989441, "num_tokens": 740353996.0, "step": 19400 }, { "epoch": 2.468006614934487, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.20945739746094, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8631080389022827, "num_tokens": 740394694.0, "step": 19401 }, { "epoch": 2.4681338252130773, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.80708694458008, "learning_rate": 1e-06, "loss": 0.5374, "mean_token_accuracy": 0.8851228952407837, "num_tokens": 740436567.0, "step": 19402 }, { "epoch": 2.468261035491668, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.76285171508789, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.8536533117294312, "num_tokens": 740476458.0, "step": 19403 }, { "epoch": 2.4683882457702584, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.815582275390625, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8725139498710632, "num_tokens": 740514703.0, "step": 19404 }, { "epoch": 2.468515456048849, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.03107833862305, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8743968605995178, "num_tokens": 740552328.0, "step": 19405 }, { "epoch": 2.4686426663274394, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.87726974487305, "learning_rate": 1e-06, "loss": 0.657, "mean_token_accuracy": 0.8454384803771973, "num_tokens": 740593141.0, "step": 19406 }, { "epoch": 2.4687698766060295, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.03569793701172, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8732178807258606, "num_tokens": 740634097.0, "step": 19407 }, { "epoch": 2.4688970868846205, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.79578399658203, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8686860799789429, "num_tokens": 740672917.0, "step": 19408 }, { "epoch": 2.4690242971632106, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.04215621948242, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8575543165206909, "num_tokens": 740709351.0, "step": 19409 }, { "epoch": 2.469151507441801, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.913631439208984, "learning_rate": 1e-06, "loss": 0.642, "mean_token_accuracy": 0.8535887002944946, "num_tokens": 740744049.0, "step": 19410 }, { "epoch": 2.4692787177203916, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.95949935913086, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8661256432533264, "num_tokens": 740784765.0, "step": 19411 }, { "epoch": 2.469405927998982, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.662681579589844, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8552124500274658, "num_tokens": 740815859.0, "step": 19412 }, { "epoch": 2.4695331382775727, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.07794952392578, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8820415139198303, "num_tokens": 740854300.0, "step": 19413 }, { "epoch": 2.4696603485561632, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.839691162109375, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8472884297370911, "num_tokens": 740891836.0, "step": 19414 }, { "epoch": 2.4697875588347538, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.670223236083984, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8718421459197998, "num_tokens": 740927434.0, "step": 19415 }, { "epoch": 2.4699147691133443, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.897239685058594, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8657477498054504, "num_tokens": 740973707.0, "step": 19416 }, { "epoch": 2.470041979391935, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.82390213012695, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8703917860984802, "num_tokens": 741015953.0, "step": 19417 }, { "epoch": 2.4701691896705253, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.94289016723633, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8714052438735962, "num_tokens": 741049634.0, "step": 19418 }, { "epoch": 2.470296399949116, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.03708267211914, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8725517988204956, "num_tokens": 741084267.0, "step": 19419 }, { "epoch": 2.4704236102277064, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.49801254272461, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8715163469314575, "num_tokens": 741122237.0, "step": 19420 }, { "epoch": 2.470550820506297, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.24569320678711, "learning_rate": 1e-06, "loss": 0.6483, "mean_token_accuracy": 0.8507725596427917, "num_tokens": 741163593.0, "step": 19421 }, { "epoch": 2.4706780307848875, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.30679702758789, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8544682264328003, "num_tokens": 741202328.0, "step": 19422 }, { "epoch": 2.470805241063478, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.499549865722656, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8826128244400024, "num_tokens": 741240095.0, "step": 19423 }, { "epoch": 2.4709324513420685, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.553768157958984, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8654075860977173, "num_tokens": 741278509.0, "step": 19424 }, { "epoch": 2.471059661620659, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.087249755859375, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8571988940238953, "num_tokens": 741321446.0, "step": 19425 }, { "epoch": 2.4711868718992496, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.97522735595703, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8603285551071167, "num_tokens": 741360469.0, "step": 19426 }, { "epoch": 2.47131408217784, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.249053955078125, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8512963652610779, "num_tokens": 741393833.0, "step": 19427 }, { "epoch": 2.4714412924564306, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.90936279296875, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.867051899433136, "num_tokens": 741426956.0, "step": 19428 }, { "epoch": 2.471568502735021, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.28800582885742, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8634630441665649, "num_tokens": 741464820.0, "step": 19429 }, { "epoch": 2.4716957130136117, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.21934127807617, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8716890811920166, "num_tokens": 741499519.0, "step": 19430 }, { "epoch": 2.471822923292202, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.498802185058594, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.869164764881134, "num_tokens": 741533443.0, "step": 19431 }, { "epoch": 2.4719501335707923, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.394466400146484, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.860582709312439, "num_tokens": 741576849.0, "step": 19432 }, { "epoch": 2.4720773438493833, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.53312301635742, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8684418201446533, "num_tokens": 741614653.0, "step": 19433 }, { "epoch": 2.4722045541279734, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.634151458740234, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8588101863861084, "num_tokens": 741649354.0, "step": 19434 }, { "epoch": 2.472331764406564, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.367916107177734, "learning_rate": 1e-06, "loss": 0.631, "mean_token_accuracy": 0.8576509952545166, "num_tokens": 741692257.0, "step": 19435 }, { "epoch": 2.4724589746851544, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.64767837524414, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8580492734909058, "num_tokens": 741729739.0, "step": 19436 }, { "epoch": 2.472586184963745, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.426025390625, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8563705086708069, "num_tokens": 741763409.0, "step": 19437 }, { "epoch": 2.4727133952423355, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.45686340332031, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8588355183601379, "num_tokens": 741798746.0, "step": 19438 }, { "epoch": 2.472840605520926, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.21355438232422, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.871401846408844, "num_tokens": 741840488.0, "step": 19439 }, { "epoch": 2.4729678157995165, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.3937873840332, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8814191818237305, "num_tokens": 741880839.0, "step": 19440 }, { "epoch": 2.473095026078107, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.73314666748047, "learning_rate": 1e-06, "loss": 0.6309, "mean_token_accuracy": 0.8574796915054321, "num_tokens": 741919317.0, "step": 19441 }, { "epoch": 2.4732222363566976, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.71739196777344, "learning_rate": 1e-06, "loss": 0.6691, "mean_token_accuracy": 0.841401219367981, "num_tokens": 741955782.0, "step": 19442 }, { "epoch": 2.473349446635288, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.566532135009766, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8566679358482361, "num_tokens": 741990410.0, "step": 19443 }, { "epoch": 2.4734766569138786, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.85811996459961, "learning_rate": 1e-06, "loss": 0.6184, "mean_token_accuracy": 0.8582925796508789, "num_tokens": 742031536.0, "step": 19444 }, { "epoch": 2.473603867192469, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.728389739990234, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8516843318939209, "num_tokens": 742067222.0, "step": 19445 }, { "epoch": 2.4737310774710597, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.81239700317383, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8749017715454102, "num_tokens": 742114740.0, "step": 19446 }, { "epoch": 2.47385828774965, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.098697662353516, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.868269681930542, "num_tokens": 742149344.0, "step": 19447 }, { "epoch": 2.4739854980282407, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.14452362060547, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8647871017456055, "num_tokens": 742191089.0, "step": 19448 }, { "epoch": 2.4741127083068313, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.57174301147461, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8624622821807861, "num_tokens": 742224944.0, "step": 19449 }, { "epoch": 2.474239918585422, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.41017150878906, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8528715968132019, "num_tokens": 742263674.0, "step": 19450 }, { "epoch": 2.4743671288640123, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.237892150878906, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8585959672927856, "num_tokens": 742300424.0, "step": 19451 }, { "epoch": 2.474494339142603, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.52645492553711, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8743649125099182, "num_tokens": 742333854.0, "step": 19452 }, { "epoch": 2.4746215494211934, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.883026123046875, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8714780807495117, "num_tokens": 742377126.0, "step": 19453 }, { "epoch": 2.474748759699784, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.88001251220703, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.86061692237854, "num_tokens": 742421284.0, "step": 19454 }, { "epoch": 2.4748759699783744, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.567665100097656, "learning_rate": 1e-06, "loss": 0.6703, "mean_token_accuracy": 0.8455064296722412, "num_tokens": 742459124.0, "step": 19455 }, { "epoch": 2.475003180256965, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.87248992919922, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.855560839176178, "num_tokens": 742493137.0, "step": 19456 }, { "epoch": 2.475130390535555, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.62620162963867, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8652560710906982, "num_tokens": 742529805.0, "step": 19457 }, { "epoch": 2.475257600814146, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.03096389770508, "learning_rate": 1e-06, "loss": 0.5615, "mean_token_accuracy": 0.8768312335014343, "num_tokens": 742560859.0, "step": 19458 }, { "epoch": 2.475384811092736, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.38300323486328, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8751747012138367, "num_tokens": 742601465.0, "step": 19459 }, { "epoch": 2.4755120213713266, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.37271499633789, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8566594123840332, "num_tokens": 742639242.0, "step": 19460 }, { "epoch": 2.475639231649917, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.37146759033203, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8623795509338379, "num_tokens": 742676574.0, "step": 19461 }, { "epoch": 2.4757664419285077, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.223636627197266, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8534170389175415, "num_tokens": 742713326.0, "step": 19462 }, { "epoch": 2.4758936522070982, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.177791595458984, "learning_rate": 1e-06, "loss": 0.6663, "mean_token_accuracy": 0.8406704664230347, "num_tokens": 742756883.0, "step": 19463 }, { "epoch": 2.4760208624856888, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.894107818603516, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.858990490436554, "num_tokens": 742794301.0, "step": 19464 }, { "epoch": 2.4761480727642793, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.86241912841797, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8665992021560669, "num_tokens": 742832966.0, "step": 19465 }, { "epoch": 2.47627528304287, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.36184310913086, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8599100112915039, "num_tokens": 742872926.0, "step": 19466 }, { "epoch": 2.4764024933214603, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.028316497802734, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8695238828659058, "num_tokens": 742910838.0, "step": 19467 }, { "epoch": 2.476529703600051, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.324127197265625, "learning_rate": 1e-06, "loss": 0.5726, "mean_token_accuracy": 0.8696153163909912, "num_tokens": 742950098.0, "step": 19468 }, { "epoch": 2.4766569138786414, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.825042724609375, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8741919994354248, "num_tokens": 742983872.0, "step": 19469 }, { "epoch": 2.476784124157232, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.87054443359375, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8521803617477417, "num_tokens": 743029816.0, "step": 19470 }, { "epoch": 2.4769113344358225, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.384071350097656, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.8546508550643921, "num_tokens": 743068565.0, "step": 19471 }, { "epoch": 2.477038544714413, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.95541763305664, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8648179769515991, "num_tokens": 743112137.0, "step": 19472 }, { "epoch": 2.4771657549930035, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.37166976928711, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8778648376464844, "num_tokens": 743145067.0, "step": 19473 }, { "epoch": 2.477292965271594, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.67234802246094, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8691040277481079, "num_tokens": 743177017.0, "step": 19474 }, { "epoch": 2.4774201755501846, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.020965576171875, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.862703800201416, "num_tokens": 743218121.0, "step": 19475 }, { "epoch": 2.477547385828775, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.96772003173828, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.868646502494812, "num_tokens": 743256635.0, "step": 19476 }, { "epoch": 2.4776745961073656, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.10980224609375, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.8606550097465515, "num_tokens": 743293801.0, "step": 19477 }, { "epoch": 2.477801806385956, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.418128967285156, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8779767751693726, "num_tokens": 743331582.0, "step": 19478 }, { "epoch": 2.4779290166645467, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.08499526977539, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8649412393569946, "num_tokens": 743369901.0, "step": 19479 }, { "epoch": 2.4780562269431368, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.39139938354492, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.858046293258667, "num_tokens": 743405808.0, "step": 19480 }, { "epoch": 2.4781834372217277, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.32657241821289, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8659842014312744, "num_tokens": 743447236.0, "step": 19481 }, { "epoch": 2.478310647500318, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.45384216308594, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8690736293792725, "num_tokens": 743486494.0, "step": 19482 }, { "epoch": 2.4784378577789083, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.31201171875, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8602408170700073, "num_tokens": 743523941.0, "step": 19483 }, { "epoch": 2.478565068057499, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.331642150878906, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8673375844955444, "num_tokens": 743554169.0, "step": 19484 }, { "epoch": 2.4786922783360894, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.01194763183594, "learning_rate": 1e-06, "loss": 0.6585, "mean_token_accuracy": 0.8468130230903625, "num_tokens": 743594656.0, "step": 19485 }, { "epoch": 2.47881948861468, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.816707611083984, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8704736828804016, "num_tokens": 743628638.0, "step": 19486 }, { "epoch": 2.4789466988932705, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.87751770019531, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8549380898475647, "num_tokens": 743665382.0, "step": 19487 }, { "epoch": 2.479073909171861, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.995582580566406, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8766194581985474, "num_tokens": 743701712.0, "step": 19488 }, { "epoch": 2.4792011194504515, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.041927337646484, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8631223440170288, "num_tokens": 743740347.0, "step": 19489 }, { "epoch": 2.479328329729042, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.9925422668457, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8641955852508545, "num_tokens": 743775434.0, "step": 19490 }, { "epoch": 2.4794555400076326, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.56333541870117, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8617417216300964, "num_tokens": 743812475.0, "step": 19491 }, { "epoch": 2.479582750286223, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.3695068359375, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8537178635597229, "num_tokens": 743851350.0, "step": 19492 }, { "epoch": 2.4797099605648136, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.148651123046875, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8584578037261963, "num_tokens": 743885086.0, "step": 19493 }, { "epoch": 2.479837170843404, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.41105270385742, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8667363524436951, "num_tokens": 743931747.0, "step": 19494 }, { "epoch": 2.4799643811219947, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.40768814086914, "learning_rate": 1e-06, "loss": 0.5715, "mean_token_accuracy": 0.8775548934936523, "num_tokens": 743971806.0, "step": 19495 }, { "epoch": 2.480091591400585, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.07907485961914, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8742294311523438, "num_tokens": 744005304.0, "step": 19496 }, { "epoch": 2.4802188016791757, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.13097381591797, "learning_rate": 1e-06, "loss": 0.6315, "mean_token_accuracy": 0.8541823625564575, "num_tokens": 744051159.0, "step": 19497 }, { "epoch": 2.4803460119577663, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.52212905883789, "learning_rate": 1e-06, "loss": 0.6551, "mean_token_accuracy": 0.8468789458274841, "num_tokens": 744089856.0, "step": 19498 }, { "epoch": 2.480473222236357, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.61955642700195, "learning_rate": 1e-06, "loss": 0.6784, "mean_token_accuracy": 0.8425418734550476, "num_tokens": 744136355.0, "step": 19499 }, { "epoch": 2.4806004325149473, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.929927825927734, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8524665832519531, "num_tokens": 744177682.0, "step": 19500 }, { "epoch": 2.480727642793538, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.93120574951172, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8646544218063354, "num_tokens": 744214209.0, "step": 19501 }, { "epoch": 2.4808548530721284, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.996334075927734, "learning_rate": 1e-06, "loss": 0.652, "mean_token_accuracy": 0.8458667397499084, "num_tokens": 744257923.0, "step": 19502 }, { "epoch": 2.480982063350719, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.37996292114258, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8692238330841064, "num_tokens": 744293370.0, "step": 19503 }, { "epoch": 2.4811092736293094, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 48.91461181640625, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8701378107070923, "num_tokens": 744328572.0, "step": 19504 }, { "epoch": 2.4812364839078995, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.584815979003906, "learning_rate": 1e-06, "loss": 0.6582, "mean_token_accuracy": 0.8457657098770142, "num_tokens": 744364124.0, "step": 19505 }, { "epoch": 2.4813636941864905, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.51116180419922, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8662127256393433, "num_tokens": 744403679.0, "step": 19506 }, { "epoch": 2.4814909044650806, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.17432403564453, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.862764835357666, "num_tokens": 744440922.0, "step": 19507 }, { "epoch": 2.481618114743671, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.416175842285156, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8597849607467651, "num_tokens": 744479025.0, "step": 19508 }, { "epoch": 2.4817453250222616, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.22935104370117, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8632079362869263, "num_tokens": 744514165.0, "step": 19509 }, { "epoch": 2.481872535300852, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.69985580444336, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8585585355758667, "num_tokens": 744547210.0, "step": 19510 }, { "epoch": 2.4819997455794427, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.0125846862793, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8631192445755005, "num_tokens": 744585132.0, "step": 19511 }, { "epoch": 2.4821269558580332, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.90702819824219, "learning_rate": 1e-06, "loss": 0.5632, "mean_token_accuracy": 0.8755265474319458, "num_tokens": 744617921.0, "step": 19512 }, { "epoch": 2.4822541661366238, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.06969451904297, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8597142696380615, "num_tokens": 744653095.0, "step": 19513 }, { "epoch": 2.4823813764152143, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.796573638916016, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8689165711402893, "num_tokens": 744688258.0, "step": 19514 }, { "epoch": 2.482508586693805, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.79235076904297, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8626447916030884, "num_tokens": 744723500.0, "step": 19515 }, { "epoch": 2.4826357969723953, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.565574645996094, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8643745183944702, "num_tokens": 744757133.0, "step": 19516 }, { "epoch": 2.482763007250986, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.22664260864258, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8745325207710266, "num_tokens": 744793814.0, "step": 19517 }, { "epoch": 2.4828902175295764, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.15842056274414, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8659607172012329, "num_tokens": 744828680.0, "step": 19518 }, { "epoch": 2.483017427808167, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.168338775634766, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8595432639122009, "num_tokens": 744861642.0, "step": 19519 }, { "epoch": 2.4831446380867574, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.17517852783203, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.874029815196991, "num_tokens": 744900345.0, "step": 19520 }, { "epoch": 2.483271848365348, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 48.87565231323242, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8698782920837402, "num_tokens": 744938589.0, "step": 19521 }, { "epoch": 2.4833990586439385, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.35454559326172, "learning_rate": 1e-06, "loss": 0.6612, "mean_token_accuracy": 0.8524194359779358, "num_tokens": 744975385.0, "step": 19522 }, { "epoch": 2.483526268922529, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.852447509765625, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8538820147514343, "num_tokens": 745010860.0, "step": 19523 }, { "epoch": 2.4836534792011196, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.23926544189453, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8548545241355896, "num_tokens": 745047183.0, "step": 19524 }, { "epoch": 2.48378068947971, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.19896697998047, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8595852851867676, "num_tokens": 745081906.0, "step": 19525 }, { "epoch": 2.4839078997583006, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.49380111694336, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8685163855552673, "num_tokens": 745127285.0, "step": 19526 }, { "epoch": 2.484035110036891, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.778045654296875, "learning_rate": 1e-06, "loss": 0.5549, "mean_token_accuracy": 0.8790145516395569, "num_tokens": 745171564.0, "step": 19527 }, { "epoch": 2.4841623203154817, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.266021728515625, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8735625743865967, "num_tokens": 745214252.0, "step": 19528 }, { "epoch": 2.484289530594072, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.97970962524414, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8655545711517334, "num_tokens": 745251983.0, "step": 19529 }, { "epoch": 2.4844167408726623, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.744468688964844, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8661320209503174, "num_tokens": 745287037.0, "step": 19530 }, { "epoch": 2.4845439511512533, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.013427734375, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8610486388206482, "num_tokens": 745327854.0, "step": 19531 }, { "epoch": 2.4846711614298433, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.57865524291992, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8510720133781433, "num_tokens": 745370227.0, "step": 19532 }, { "epoch": 2.484798371708434, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.30713653564453, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8610532283782959, "num_tokens": 745409416.0, "step": 19533 }, { "epoch": 2.4849255819870244, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.44914245605469, "learning_rate": 1e-06, "loss": 0.6657, "mean_token_accuracy": 0.8440312743186951, "num_tokens": 745449908.0, "step": 19534 }, { "epoch": 2.485052792265615, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.95170593261719, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8511691093444824, "num_tokens": 745486384.0, "step": 19535 }, { "epoch": 2.4851800025442055, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.24028396606445, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8526734113693237, "num_tokens": 745523058.0, "step": 19536 }, { "epoch": 2.485307212822796, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.716060638427734, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8456116318702698, "num_tokens": 745559276.0, "step": 19537 }, { "epoch": 2.4854344231013865, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.47134017944336, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8619628548622131, "num_tokens": 745592726.0, "step": 19538 }, { "epoch": 2.485561633379977, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.63140106201172, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8659704923629761, "num_tokens": 745630117.0, "step": 19539 }, { "epoch": 2.4856888436585676, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.87586975097656, "learning_rate": 1e-06, "loss": 0.6455, "mean_token_accuracy": 0.8495650291442871, "num_tokens": 745664968.0, "step": 19540 }, { "epoch": 2.485816053937158, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.76582717895508, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8641854524612427, "num_tokens": 745701893.0, "step": 19541 }, { "epoch": 2.4859432642157486, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.5254020690918, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8593555688858032, "num_tokens": 745736884.0, "step": 19542 }, { "epoch": 2.486070474494339, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.059696197509766, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8750569820404053, "num_tokens": 745782381.0, "step": 19543 }, { "epoch": 2.4861976847729297, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.51910400390625, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8782048225402832, "num_tokens": 745827630.0, "step": 19544 }, { "epoch": 2.48632489505152, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.27485275268555, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8708692789077759, "num_tokens": 745862223.0, "step": 19545 }, { "epoch": 2.4864521053301107, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.61426544189453, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8789647817611694, "num_tokens": 745903077.0, "step": 19546 }, { "epoch": 2.4865793156087013, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.959205627441406, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8636143207550049, "num_tokens": 745945534.0, "step": 19547 }, { "epoch": 2.486706525887292, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.75860595703125, "learning_rate": 1e-06, "loss": 0.6501, "mean_token_accuracy": 0.8475330471992493, "num_tokens": 745986308.0, "step": 19548 }, { "epoch": 2.4868337361658823, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.68711853027344, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8537227511405945, "num_tokens": 746022206.0, "step": 19549 }, { "epoch": 2.486960946444473, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.035911560058594, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8705973029136658, "num_tokens": 746059854.0, "step": 19550 }, { "epoch": 2.4870881567230634, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.48612976074219, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.870358943939209, "num_tokens": 746103994.0, "step": 19551 }, { "epoch": 2.487215367001654, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.20746994018555, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8533578515052795, "num_tokens": 746138491.0, "step": 19552 }, { "epoch": 2.4873425772802444, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.729576110839844, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8600797653198242, "num_tokens": 746172598.0, "step": 19553 }, { "epoch": 2.487469787558835, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.53525161743164, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8706380128860474, "num_tokens": 746210385.0, "step": 19554 }, { "epoch": 2.487596997837425, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.76131057739258, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8653768301010132, "num_tokens": 746252563.0, "step": 19555 }, { "epoch": 2.487724208116016, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.673912048339844, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8685692548751831, "num_tokens": 746292356.0, "step": 19556 }, { "epoch": 2.487851418394606, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.819602966308594, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.851386547088623, "num_tokens": 746325467.0, "step": 19557 }, { "epoch": 2.4879786286731966, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.7916145324707, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.8573227524757385, "num_tokens": 746362285.0, "step": 19558 }, { "epoch": 2.488105838951787, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.629249572753906, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8716821670532227, "num_tokens": 746401540.0, "step": 19559 }, { "epoch": 2.4882330492303777, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.288944244384766, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8647085428237915, "num_tokens": 746439282.0, "step": 19560 }, { "epoch": 2.488360259508968, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.40319061279297, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.874518632888794, "num_tokens": 746480993.0, "step": 19561 }, { "epoch": 2.4884874697875587, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.885231018066406, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8604686260223389, "num_tokens": 746519318.0, "step": 19562 }, { "epoch": 2.4886146800661493, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.67671585083008, "learning_rate": 1e-06, "loss": 0.6582, "mean_token_accuracy": 0.8477717638015747, "num_tokens": 746559761.0, "step": 19563 }, { "epoch": 2.48874189034474, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.112274169921875, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8713378310203552, "num_tokens": 746591622.0, "step": 19564 }, { "epoch": 2.4888691006233303, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.39498519897461, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8594027757644653, "num_tokens": 746627709.0, "step": 19565 }, { "epoch": 2.488996310901921, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 48.94586944580078, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.8602980375289917, "num_tokens": 746663445.0, "step": 19566 }, { "epoch": 2.4891235211805114, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.23017883300781, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8516287803649902, "num_tokens": 746706869.0, "step": 19567 }, { "epoch": 2.489250731459102, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.68928527832031, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8746300339698792, "num_tokens": 746749502.0, "step": 19568 }, { "epoch": 2.4893779417376924, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.99778366088867, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8671465516090393, "num_tokens": 746786510.0, "step": 19569 }, { "epoch": 2.489505152016283, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.531803131103516, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8669314384460449, "num_tokens": 746819430.0, "step": 19570 }, { "epoch": 2.4896323622948735, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.092750549316406, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8664439916610718, "num_tokens": 746856018.0, "step": 19571 }, { "epoch": 2.489759572573464, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.894935607910156, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8577927947044373, "num_tokens": 746897350.0, "step": 19572 }, { "epoch": 2.4898867828520546, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.95465850830078, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8546344041824341, "num_tokens": 746937163.0, "step": 19573 }, { "epoch": 2.490013993130645, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.89729309082031, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8723971843719482, "num_tokens": 746976140.0, "step": 19574 }, { "epoch": 2.4901412034092356, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.2176513671875, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8624831438064575, "num_tokens": 747012576.0, "step": 19575 }, { "epoch": 2.490268413687826, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.74404525756836, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8612473011016846, "num_tokens": 747048179.0, "step": 19576 }, { "epoch": 2.4903956239664167, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.2579460144043, "learning_rate": 1e-06, "loss": 0.6564, "mean_token_accuracy": 0.8447514772415161, "num_tokens": 747085291.0, "step": 19577 }, { "epoch": 2.4905228342450068, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.60527038574219, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.869138240814209, "num_tokens": 747119085.0, "step": 19578 }, { "epoch": 2.4906500445235977, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.13825988769531, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8614135384559631, "num_tokens": 747153463.0, "step": 19579 }, { "epoch": 2.490777254802188, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.430885314941406, "learning_rate": 1e-06, "loss": 0.6849, "mean_token_accuracy": 0.8372593522071838, "num_tokens": 747194781.0, "step": 19580 }, { "epoch": 2.4909044650807783, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.32429885864258, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8650046586990356, "num_tokens": 747235328.0, "step": 19581 }, { "epoch": 2.491031675359369, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.44496154785156, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8593931794166565, "num_tokens": 747278481.0, "step": 19582 }, { "epoch": 2.4911588856379594, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.35698318481445, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8786841630935669, "num_tokens": 747313077.0, "step": 19583 }, { "epoch": 2.49128609591655, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.61449432373047, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8692293167114258, "num_tokens": 747352693.0, "step": 19584 }, { "epoch": 2.4914133061951405, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.27450942993164, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.854726254940033, "num_tokens": 747397717.0, "step": 19585 }, { "epoch": 2.491540516473731, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.458961486816406, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8661206960678101, "num_tokens": 747435457.0, "step": 19586 }, { "epoch": 2.4916677267523215, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.246952056884766, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8674272298812866, "num_tokens": 747479017.0, "step": 19587 }, { "epoch": 2.491794937030912, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.37040710449219, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8621858358383179, "num_tokens": 747520247.0, "step": 19588 }, { "epoch": 2.4919221473095026, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.96732711791992, "learning_rate": 1e-06, "loss": 0.6541, "mean_token_accuracy": 0.849443256855011, "num_tokens": 747559983.0, "step": 19589 }, { "epoch": 2.492049357588093, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.925086975097656, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8581693172454834, "num_tokens": 747595746.0, "step": 19590 }, { "epoch": 2.4921765678666836, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.81801223754883, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8594298362731934, "num_tokens": 747639261.0, "step": 19591 }, { "epoch": 2.492303778145274, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.683589935302734, "learning_rate": 1e-06, "loss": 0.6615, "mean_token_accuracy": 0.8438175320625305, "num_tokens": 747680665.0, "step": 19592 }, { "epoch": 2.4924309884238647, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.784793853759766, "learning_rate": 1e-06, "loss": 0.5923, "mean_token_accuracy": 0.867886483669281, "num_tokens": 747713320.0, "step": 19593 }, { "epoch": 2.492558198702455, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.49936294555664, "learning_rate": 1e-06, "loss": 0.7031, "mean_token_accuracy": 0.8365582227706909, "num_tokens": 747752167.0, "step": 19594 }, { "epoch": 2.4926854089810457, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.73858642578125, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.867388904094696, "num_tokens": 747787274.0, "step": 19595 }, { "epoch": 2.4928126192596363, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.8579216003418, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8573158979415894, "num_tokens": 747826647.0, "step": 19596 }, { "epoch": 2.492939829538227, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.014041900634766, "learning_rate": 1e-06, "loss": 0.6216, "mean_token_accuracy": 0.8579222559928894, "num_tokens": 747866797.0, "step": 19597 }, { "epoch": 2.4930670398168173, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.859371185302734, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8631473779678345, "num_tokens": 747907589.0, "step": 19598 }, { "epoch": 2.493194250095408, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.60296630859375, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8547163009643555, "num_tokens": 747939865.0, "step": 19599 }, { "epoch": 2.4933214603739984, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.01395797729492, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8597396612167358, "num_tokens": 747976238.0, "step": 19600 }, { "epoch": 2.493448670652589, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.680599212646484, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8600667715072632, "num_tokens": 748012076.0, "step": 19601 }, { "epoch": 2.4935758809311794, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.036529541015625, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8557334542274475, "num_tokens": 748042518.0, "step": 19602 }, { "epoch": 2.4937030912097695, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.11997985839844, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8704367280006409, "num_tokens": 748077509.0, "step": 19603 }, { "epoch": 2.4938303014883605, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.612770080566406, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8679894208908081, "num_tokens": 748118164.0, "step": 19604 }, { "epoch": 2.4939575117669506, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.78000259399414, "learning_rate": 1e-06, "loss": 0.6311, "mean_token_accuracy": 0.8552291393280029, "num_tokens": 748153609.0, "step": 19605 }, { "epoch": 2.494084722045541, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.4603157043457, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8604906797409058, "num_tokens": 748189573.0, "step": 19606 }, { "epoch": 2.4942119323241316, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.04277420043945, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.856995701789856, "num_tokens": 748231121.0, "step": 19607 }, { "epoch": 2.494339142602722, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.058345794677734, "learning_rate": 1e-06, "loss": 0.5607, "mean_token_accuracy": 0.8730118870735168, "num_tokens": 748268744.0, "step": 19608 }, { "epoch": 2.4944663528813127, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.1794319152832, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8670110702514648, "num_tokens": 748307441.0, "step": 19609 }, { "epoch": 2.494593563159903, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.48442077636719, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.861727774143219, "num_tokens": 748343506.0, "step": 19610 }, { "epoch": 2.4947207734384937, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.79536056518555, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8634171485900879, "num_tokens": 748382209.0, "step": 19611 }, { "epoch": 2.4948479837170843, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.776344299316406, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8834277391433716, "num_tokens": 748413660.0, "step": 19612 }, { "epoch": 2.494975193995675, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.197078704833984, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.864057183265686, "num_tokens": 748457037.0, "step": 19613 }, { "epoch": 2.4951024042742653, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.4033088684082, "learning_rate": 1e-06, "loss": 0.6807, "mean_token_accuracy": 0.8410312533378601, "num_tokens": 748497227.0, "step": 19614 }, { "epoch": 2.495229614552856, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.390716552734375, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8589534759521484, "num_tokens": 748529901.0, "step": 19615 }, { "epoch": 2.4953568248314464, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.29188919067383, "learning_rate": 1e-06, "loss": 0.6516, "mean_token_accuracy": 0.8484350442886353, "num_tokens": 748568238.0, "step": 19616 }, { "epoch": 2.495484035110037, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.69245910644531, "learning_rate": 1e-06, "loss": 0.6415, "mean_token_accuracy": 0.8519399166107178, "num_tokens": 748608153.0, "step": 19617 }, { "epoch": 2.4956112453886274, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.15625762939453, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8583523631095886, "num_tokens": 748646977.0, "step": 19618 }, { "epoch": 2.495738455667218, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.970863342285156, "learning_rate": 1e-06, "loss": 0.6442, "mean_token_accuracy": 0.8519859313964844, "num_tokens": 748689586.0, "step": 19619 }, { "epoch": 2.4958656659458085, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.256591796875, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8609207272529602, "num_tokens": 748730915.0, "step": 19620 }, { "epoch": 2.495992876224399, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.73749542236328, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8572496175765991, "num_tokens": 748766996.0, "step": 19621 }, { "epoch": 2.4961200865029896, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.38938522338867, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.8535533547401428, "num_tokens": 748801225.0, "step": 19622 }, { "epoch": 2.49624729678158, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.66652297973633, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8573373556137085, "num_tokens": 748841355.0, "step": 19623 }, { "epoch": 2.4963745070601706, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.83308410644531, "learning_rate": 1e-06, "loss": 0.6178, "mean_token_accuracy": 0.8617508411407471, "num_tokens": 748879636.0, "step": 19624 }, { "epoch": 2.496501717338761, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.00546646118164, "learning_rate": 1e-06, "loss": 0.6287, "mean_token_accuracy": 0.8579890131950378, "num_tokens": 748917795.0, "step": 19625 }, { "epoch": 2.4966289276173517, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.85840606689453, "learning_rate": 1e-06, "loss": 0.6982, "mean_token_accuracy": 0.8338877558708191, "num_tokens": 748956512.0, "step": 19626 }, { "epoch": 2.496756137895942, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.81165313720703, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8654878735542297, "num_tokens": 748995577.0, "step": 19627 }, { "epoch": 2.4968833481745323, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.905826568603516, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.857992947101593, "num_tokens": 749027672.0, "step": 19628 }, { "epoch": 2.4970105584531233, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.09232711791992, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8715980052947998, "num_tokens": 749059101.0, "step": 19629 }, { "epoch": 2.4971377687317133, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.23075866699219, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8466631174087524, "num_tokens": 749097034.0, "step": 19630 }, { "epoch": 2.497264979010304, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.91871643066406, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.859297513961792, "num_tokens": 749137536.0, "step": 19631 }, { "epoch": 2.4973921892888944, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.70045852661133, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8617475032806396, "num_tokens": 749171487.0, "step": 19632 }, { "epoch": 2.497519399567485, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.29988098144531, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8546526432037354, "num_tokens": 749213234.0, "step": 19633 }, { "epoch": 2.4976466098460754, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.58025360107422, "learning_rate": 1e-06, "loss": 0.6516, "mean_token_accuracy": 0.8504493832588196, "num_tokens": 749256184.0, "step": 19634 }, { "epoch": 2.497773820124666, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.424137115478516, "learning_rate": 1e-06, "loss": 0.6184, "mean_token_accuracy": 0.858592689037323, "num_tokens": 749292909.0, "step": 19635 }, { "epoch": 2.4979010304032565, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.55818176269531, "learning_rate": 1e-06, "loss": 0.6574, "mean_token_accuracy": 0.8490523099899292, "num_tokens": 749327947.0, "step": 19636 }, { "epoch": 2.498028240681847, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.53507995605469, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8697898387908936, "num_tokens": 749364107.0, "step": 19637 }, { "epoch": 2.4981554509604376, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.726280212402344, "learning_rate": 1e-06, "loss": 0.5548, "mean_token_accuracy": 0.8779045343399048, "num_tokens": 749398055.0, "step": 19638 }, { "epoch": 2.498282661239028, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.28345489501953, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8602176904678345, "num_tokens": 749434626.0, "step": 19639 }, { "epoch": 2.4984098715176186, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.47294616699219, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8554728627204895, "num_tokens": 749470616.0, "step": 19640 }, { "epoch": 2.498537081796209, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.07379150390625, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8648728728294373, "num_tokens": 749513072.0, "step": 19641 }, { "epoch": 2.4986642920747997, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.72795867919922, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8748998045921326, "num_tokens": 749546988.0, "step": 19642 }, { "epoch": 2.49879150235339, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.06633377075195, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8538371920585632, "num_tokens": 749585851.0, "step": 19643 }, { "epoch": 2.4989187126319807, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.43556594848633, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.858553409576416, "num_tokens": 749619510.0, "step": 19644 }, { "epoch": 2.4990459229105713, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.157203674316406, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8695870041847229, "num_tokens": 749653660.0, "step": 19645 }, { "epoch": 2.499173133189162, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.423858642578125, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8617337346076965, "num_tokens": 749691321.0, "step": 19646 }, { "epoch": 2.4993003434677523, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.22111892700195, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.861535370349884, "num_tokens": 749729305.0, "step": 19647 }, { "epoch": 2.499427553746343, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.19761276245117, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8680880069732666, "num_tokens": 749766134.0, "step": 19648 }, { "epoch": 2.4995547640249334, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.18730545043945, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.857779860496521, "num_tokens": 749807715.0, "step": 19649 }, { "epoch": 2.499681974303524, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.02922821044922, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.862646222114563, "num_tokens": 749838799.0, "step": 19650 }, { "epoch": 2.499809184582114, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.50729751586914, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8751702904701233, "num_tokens": 749874283.0, "step": 19651 }, { "epoch": 2.499936394860705, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.28431701660156, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8590375185012817, "num_tokens": 749915912.0, "step": 19652 }, { "epoch": 2.500063605139295, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.593074798583984, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.867722749710083, "num_tokens": 749959133.0, "step": 19653 }, { "epoch": 2.500190815417886, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.47222900390625, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8593443036079407, "num_tokens": 749997846.0, "step": 19654 }, { "epoch": 2.500318025696476, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.71732711791992, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8671574592590332, "num_tokens": 750040973.0, "step": 19655 }, { "epoch": 2.5004452359750666, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.35628128051758, "learning_rate": 1e-06, "loss": 0.6407, "mean_token_accuracy": 0.8566712141036987, "num_tokens": 750082171.0, "step": 19656 }, { "epoch": 2.500572446253657, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.297950744628906, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8732343316078186, "num_tokens": 750123083.0, "step": 19657 }, { "epoch": 2.5006996565322477, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.079708099365234, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.847618579864502, "num_tokens": 750162235.0, "step": 19658 }, { "epoch": 2.500826866810838, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.769866943359375, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8738749623298645, "num_tokens": 750196501.0, "step": 19659 }, { "epoch": 2.5009540770894287, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.93646240234375, "learning_rate": 1e-06, "loss": 0.7103, "mean_token_accuracy": 0.8334478139877319, "num_tokens": 750241178.0, "step": 19660 }, { "epoch": 2.5010812873680193, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.70612716674805, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.866633951663971, "num_tokens": 750284104.0, "step": 19661 }, { "epoch": 2.50120849764661, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.75779342651367, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8671251535415649, "num_tokens": 750318525.0, "step": 19662 }, { "epoch": 2.5013357079252003, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.92892074584961, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8703176379203796, "num_tokens": 750358892.0, "step": 19663 }, { "epoch": 2.501462918203791, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.403987884521484, "learning_rate": 1e-06, "loss": 0.6473, "mean_token_accuracy": 0.8480710387229919, "num_tokens": 750399325.0, "step": 19664 }, { "epoch": 2.5015901284823814, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.6532096862793, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8562933206558228, "num_tokens": 750437493.0, "step": 19665 }, { "epoch": 2.501717338760972, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.222557067871094, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8686250448226929, "num_tokens": 750472381.0, "step": 19666 }, { "epoch": 2.5018445490395624, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.33179473876953, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8435690999031067, "num_tokens": 750511786.0, "step": 19667 }, { "epoch": 2.501971759318153, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.14250564575195, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8677679300308228, "num_tokens": 750550596.0, "step": 19668 }, { "epoch": 2.5020989695967435, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.37251281738281, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8650491833686829, "num_tokens": 750594790.0, "step": 19669 }, { "epoch": 2.502226179875334, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.749549865722656, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8723363876342773, "num_tokens": 750636738.0, "step": 19670 }, { "epoch": 2.5023533901539246, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.138580322265625, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.873762845993042, "num_tokens": 750672338.0, "step": 19671 }, { "epoch": 2.502480600432515, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 48.9750862121582, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8548532128334045, "num_tokens": 750709149.0, "step": 19672 }, { "epoch": 2.5026078107111056, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.999874114990234, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8736749291419983, "num_tokens": 750741727.0, "step": 19673 }, { "epoch": 2.5027350209896957, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.01723861694336, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8504852056503296, "num_tokens": 750775903.0, "step": 19674 }, { "epoch": 2.5028622312682867, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.311683654785156, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8551023602485657, "num_tokens": 750810502.0, "step": 19675 }, { "epoch": 2.5029894415468767, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.16919708251953, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8706116676330566, "num_tokens": 750847762.0, "step": 19676 }, { "epoch": 2.5031166518254677, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.608306884765625, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8631250262260437, "num_tokens": 750887201.0, "step": 19677 }, { "epoch": 2.503243862104058, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.01144790649414, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8598930239677429, "num_tokens": 750927323.0, "step": 19678 }, { "epoch": 2.5033710723826488, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.12336730957031, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8521257042884827, "num_tokens": 750972501.0, "step": 19679 }, { "epoch": 2.503498282661239, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.152870178222656, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8711563944816589, "num_tokens": 751010743.0, "step": 19680 }, { "epoch": 2.5036254929398294, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.64348602294922, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8563492894172668, "num_tokens": 751050440.0, "step": 19681 }, { "epoch": 2.50375270321842, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.4923095703125, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8652123212814331, "num_tokens": 751088319.0, "step": 19682 }, { "epoch": 2.5038799134970104, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.386802673339844, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8616663217544556, "num_tokens": 751123766.0, "step": 19683 }, { "epoch": 2.504007123775601, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.308921813964844, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8787652850151062, "num_tokens": 751163375.0, "step": 19684 }, { "epoch": 2.5041343340541915, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.72495651245117, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.857787013053894, "num_tokens": 751204526.0, "step": 19685 }, { "epoch": 2.504261544332782, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.74196243286133, "learning_rate": 1e-06, "loss": 0.5884, "mean_token_accuracy": 0.8656810522079468, "num_tokens": 751245133.0, "step": 19686 }, { "epoch": 2.5043887546113726, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.0538215637207, "learning_rate": 1e-06, "loss": 0.6457, "mean_token_accuracy": 0.851010799407959, "num_tokens": 751284578.0, "step": 19687 }, { "epoch": 2.504515964889963, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.05034637451172, "learning_rate": 1e-06, "loss": 0.6348, "mean_token_accuracy": 0.8536888360977173, "num_tokens": 751325794.0, "step": 19688 }, { "epoch": 2.5046431751685536, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.209877014160156, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8672134280204773, "num_tokens": 751363777.0, "step": 19689 }, { "epoch": 2.504770385447144, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.775779724121094, "learning_rate": 1e-06, "loss": 0.6481, "mean_token_accuracy": 0.8534836173057556, "num_tokens": 751400499.0, "step": 19690 }, { "epoch": 2.5048975957257347, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.078060150146484, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8575888872146606, "num_tokens": 751434895.0, "step": 19691 }, { "epoch": 2.505024806004325, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.61758804321289, "learning_rate": 1e-06, "loss": 0.5739, "mean_token_accuracy": 0.8727878332138062, "num_tokens": 751473984.0, "step": 19692 }, { "epoch": 2.5051520162829157, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.3773193359375, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8613865375518799, "num_tokens": 751515030.0, "step": 19693 }, { "epoch": 2.5052792265615063, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.694950103759766, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.857774019241333, "num_tokens": 751550023.0, "step": 19694 }, { "epoch": 2.505406436840097, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.618797302246094, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8625431060791016, "num_tokens": 751584793.0, "step": 19695 }, { "epoch": 2.5055336471186873, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.0023078918457, "learning_rate": 1e-06, "loss": 0.6664, "mean_token_accuracy": 0.8432353138923645, "num_tokens": 751619817.0, "step": 19696 }, { "epoch": 2.505660857397278, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.69536590576172, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8685647249221802, "num_tokens": 751657097.0, "step": 19697 }, { "epoch": 2.5057880676758684, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.993873596191406, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8505077958106995, "num_tokens": 751687042.0, "step": 19698 }, { "epoch": 2.5059152779544585, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.64545440673828, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8707102537155151, "num_tokens": 751724281.0, "step": 19699 }, { "epoch": 2.5060424882330494, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.81173324584961, "learning_rate": 1e-06, "loss": 0.6727, "mean_token_accuracy": 0.8427630662918091, "num_tokens": 751765144.0, "step": 19700 }, { "epoch": 2.5061696985116395, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.761173248291016, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8690828084945679, "num_tokens": 751799602.0, "step": 19701 }, { "epoch": 2.5062969087902305, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.95000076293945, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8707262277603149, "num_tokens": 751836972.0, "step": 19702 }, { "epoch": 2.5064241190688206, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.99630355834961, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8617526292800903, "num_tokens": 751869627.0, "step": 19703 }, { "epoch": 2.5065513293474115, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.44588088989258, "learning_rate": 1e-06, "loss": 0.6455, "mean_token_accuracy": 0.8543773293495178, "num_tokens": 751911103.0, "step": 19704 }, { "epoch": 2.5066785396260016, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.12908935546875, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8551164269447327, "num_tokens": 751948956.0, "step": 19705 }, { "epoch": 2.506805749904592, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.34043502807617, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8647761344909668, "num_tokens": 751992185.0, "step": 19706 }, { "epoch": 2.5069329601831827, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.00538635253906, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8569442629814148, "num_tokens": 752032661.0, "step": 19707 }, { "epoch": 2.507060170461773, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.1431999206543, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8661482930183411, "num_tokens": 752073946.0, "step": 19708 }, { "epoch": 2.5071873807403637, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 49.890804290771484, "learning_rate": 1e-06, "loss": 0.6624, "mean_token_accuracy": 0.8464722633361816, "num_tokens": 752118766.0, "step": 19709 }, { "epoch": 2.5073145910189543, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.605621337890625, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8651044368743896, "num_tokens": 752159669.0, "step": 19710 }, { "epoch": 2.507441801297545, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.613914489746094, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8636445999145508, "num_tokens": 752198139.0, "step": 19711 }, { "epoch": 2.5075690115761353, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.65678024291992, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8637863397598267, "num_tokens": 752239686.0, "step": 19712 }, { "epoch": 2.507696221854726, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.06129837036133, "learning_rate": 1e-06, "loss": 0.656, "mean_token_accuracy": 0.8446934819221497, "num_tokens": 752275740.0, "step": 19713 }, { "epoch": 2.5078234321333164, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.80938720703125, "learning_rate": 1e-06, "loss": 0.6563, "mean_token_accuracy": 0.8479271531105042, "num_tokens": 752310554.0, "step": 19714 }, { "epoch": 2.507950642411907, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.741363525390625, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8670885562896729, "num_tokens": 752347761.0, "step": 19715 }, { "epoch": 2.5080778526904974, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.114479064941406, "learning_rate": 1e-06, "loss": 0.6556, "mean_token_accuracy": 0.8500173091888428, "num_tokens": 752387128.0, "step": 19716 }, { "epoch": 2.508205062969088, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.99390411376953, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8593845367431641, "num_tokens": 752428722.0, "step": 19717 }, { "epoch": 2.5083322732476785, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.81591033935547, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8600130081176758, "num_tokens": 752468920.0, "step": 19718 }, { "epoch": 2.508459483526269, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.30690002441406, "learning_rate": 1e-06, "loss": 0.6577, "mean_token_accuracy": 0.8522720336914062, "num_tokens": 752505226.0, "step": 19719 }, { "epoch": 2.5085866938048595, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.844215393066406, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8633512258529663, "num_tokens": 752543693.0, "step": 19720 }, { "epoch": 2.50871390408345, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.91913604736328, "learning_rate": 1e-06, "loss": 0.6582, "mean_token_accuracy": 0.8451248407363892, "num_tokens": 752578758.0, "step": 19721 }, { "epoch": 2.5088411143620406, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.768409729003906, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8729667067527771, "num_tokens": 752612115.0, "step": 19722 }, { "epoch": 2.508968324640631, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.95035934448242, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8676825761795044, "num_tokens": 752644601.0, "step": 19723 }, { "epoch": 2.509095534919221, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.81831741333008, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8662495017051697, "num_tokens": 752682333.0, "step": 19724 }, { "epoch": 2.509222745197812, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.893619537353516, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8571956157684326, "num_tokens": 752721802.0, "step": 19725 }, { "epoch": 2.5093499554764023, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.72123336791992, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.875973105430603, "num_tokens": 752755673.0, "step": 19726 }, { "epoch": 2.5094771657549932, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.56510543823242, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8721665740013123, "num_tokens": 752795575.0, "step": 19727 }, { "epoch": 2.5096043760335833, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.35731506347656, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8776998519897461, "num_tokens": 752833172.0, "step": 19728 }, { "epoch": 2.5097315863121743, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.055870056152344, "learning_rate": 1e-06, "loss": 0.6506, "mean_token_accuracy": 0.8462375402450562, "num_tokens": 752872857.0, "step": 19729 }, { "epoch": 2.5098587965907644, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.32990646362305, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8717743158340454, "num_tokens": 752908553.0, "step": 19730 }, { "epoch": 2.509986006869355, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.933860778808594, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8666883111000061, "num_tokens": 752937762.0, "step": 19731 }, { "epoch": 2.5101132171479454, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.859283447265625, "learning_rate": 1e-06, "loss": 0.6454, "mean_token_accuracy": 0.8479492664337158, "num_tokens": 752977184.0, "step": 19732 }, { "epoch": 2.510240427426536, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.90020751953125, "learning_rate": 1e-06, "loss": 0.6137, "mean_token_accuracy": 0.8564387559890747, "num_tokens": 753021071.0, "step": 19733 }, { "epoch": 2.5103676377051265, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.59467697143555, "learning_rate": 1e-06, "loss": 0.6935, "mean_token_accuracy": 0.8352285027503967, "num_tokens": 753056597.0, "step": 19734 }, { "epoch": 2.510494847983717, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.12238311767578, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8640350103378296, "num_tokens": 753096178.0, "step": 19735 }, { "epoch": 2.5106220582623076, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.53025817871094, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8649981021881104, "num_tokens": 753130388.0, "step": 19736 }, { "epoch": 2.510749268540898, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.743595123291016, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8701569437980652, "num_tokens": 753166585.0, "step": 19737 }, { "epoch": 2.5108764788194886, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.66374969482422, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8713805675506592, "num_tokens": 753203403.0, "step": 19738 }, { "epoch": 2.511003689098079, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.6812629699707, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8669931888580322, "num_tokens": 753241632.0, "step": 19739 }, { "epoch": 2.5111308993766697, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.765586853027344, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8760664463043213, "num_tokens": 753279292.0, "step": 19740 }, { "epoch": 2.51125810965526, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.83289337158203, "learning_rate": 1e-06, "loss": 0.6433, "mean_token_accuracy": 0.8513034582138062, "num_tokens": 753319121.0, "step": 19741 }, { "epoch": 2.5113853199338507, "ewc_loss": 0.203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.98259735107422, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8468564748764038, "num_tokens": 753353074.0, "step": 19742 }, { "epoch": 2.5115125302124413, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.932518005371094, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8808197975158691, "num_tokens": 753388994.0, "step": 19743 }, { "epoch": 2.511639740491032, "ewc_loss": 0.203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.13967514038086, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8684871792793274, "num_tokens": 753430271.0, "step": 19744 }, { "epoch": 2.5117669507696223, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.82680130004883, "learning_rate": 1e-06, "loss": 0.6386, "mean_token_accuracy": 0.8543251156806946, "num_tokens": 753473820.0, "step": 19745 }, { "epoch": 2.511894161048213, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.46369934082031, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8573265075683594, "num_tokens": 753516397.0, "step": 19746 }, { "epoch": 2.5120213713268034, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.70916748046875, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8727887272834778, "num_tokens": 753557589.0, "step": 19747 }, { "epoch": 2.512148581605394, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.38317108154297, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.871008038520813, "num_tokens": 753593764.0, "step": 19748 }, { "epoch": 2.512275791883984, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.11902618408203, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8485827445983887, "num_tokens": 753635967.0, "step": 19749 }, { "epoch": 2.512403002162575, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.5294189453125, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8701265454292297, "num_tokens": 753674595.0, "step": 19750 }, { "epoch": 2.512530212441165, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.29638671875, "learning_rate": 1e-06, "loss": 0.6663, "mean_token_accuracy": 0.8456129431724548, "num_tokens": 753714118.0, "step": 19751 }, { "epoch": 2.512657422719756, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.22214889526367, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8478034138679504, "num_tokens": 753758139.0, "step": 19752 }, { "epoch": 2.512784632998346, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.07475662231445, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8518497943878174, "num_tokens": 753800373.0, "step": 19753 }, { "epoch": 2.5129118432769366, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.09128952026367, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8568111658096313, "num_tokens": 753832366.0, "step": 19754 }, { "epoch": 2.513039053555527, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.239498138427734, "learning_rate": 1e-06, "loss": 0.553, "mean_token_accuracy": 0.8811018466949463, "num_tokens": 753866176.0, "step": 19755 }, { "epoch": 2.5131662638341177, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.27869415283203, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8600718975067139, "num_tokens": 753911470.0, "step": 19756 }, { "epoch": 2.513293474112708, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.400291442871094, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8590919971466064, "num_tokens": 753953027.0, "step": 19757 }, { "epoch": 2.5134206843912987, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.28240203857422, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8558339476585388, "num_tokens": 753987992.0, "step": 19758 }, { "epoch": 2.5135478946698893, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.43952178955078, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8663517832756042, "num_tokens": 754026671.0, "step": 19759 }, { "epoch": 2.51367510494848, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.52975082397461, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8614774942398071, "num_tokens": 754063526.0, "step": 19760 }, { "epoch": 2.5138023152270703, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.95521926879883, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8451453447341919, "num_tokens": 754103361.0, "step": 19761 }, { "epoch": 2.513929525505661, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.51285171508789, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8653914928436279, "num_tokens": 754141969.0, "step": 19762 }, { "epoch": 2.5140567357842514, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.05656051635742, "learning_rate": 1e-06, "loss": 0.5521, "mean_token_accuracy": 0.8770913481712341, "num_tokens": 754182691.0, "step": 19763 }, { "epoch": 2.514183946062842, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.566471099853516, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8604153990745544, "num_tokens": 754219956.0, "step": 19764 }, { "epoch": 2.5143111563414324, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.809513092041016, "learning_rate": 1e-06, "loss": 0.6204, "mean_token_accuracy": 0.8564115762710571, "num_tokens": 754259467.0, "step": 19765 }, { "epoch": 2.514438366620023, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.787811279296875, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8655521869659424, "num_tokens": 754300344.0, "step": 19766 }, { "epoch": 2.5145655768986135, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.43026351928711, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8630260229110718, "num_tokens": 754342606.0, "step": 19767 }, { "epoch": 2.514692787177204, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.29003143310547, "learning_rate": 1e-06, "loss": 0.6246, "mean_token_accuracy": 0.8581871390342712, "num_tokens": 754379946.0, "step": 19768 }, { "epoch": 2.5148199974557945, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.19556427001953, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8758476972579956, "num_tokens": 754414353.0, "step": 19769 }, { "epoch": 2.514947207734385, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.52705764770508, "learning_rate": 1e-06, "loss": 0.6565, "mean_token_accuracy": 0.8477219343185425, "num_tokens": 754455506.0, "step": 19770 }, { "epoch": 2.5150744180129756, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.905155181884766, "learning_rate": 1e-06, "loss": 0.6613, "mean_token_accuracy": 0.8436093330383301, "num_tokens": 754498316.0, "step": 19771 }, { "epoch": 2.5152016282915657, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.74448013305664, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8481767177581787, "num_tokens": 754534943.0, "step": 19772 }, { "epoch": 2.5153288385701567, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.973445892333984, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8589642643928528, "num_tokens": 754572775.0, "step": 19773 }, { "epoch": 2.5154560488487467, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.742408752441406, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8714737892150879, "num_tokens": 754610473.0, "step": 19774 }, { "epoch": 2.5155832591273377, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.59621810913086, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8621715307235718, "num_tokens": 754649527.0, "step": 19775 }, { "epoch": 2.515710469405928, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.54366683959961, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8632906675338745, "num_tokens": 754691513.0, "step": 19776 }, { "epoch": 2.5158376796845188, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.270145416259766, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.8479777574539185, "num_tokens": 754727932.0, "step": 19777 }, { "epoch": 2.515964889963109, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.709285736083984, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8678088188171387, "num_tokens": 754764571.0, "step": 19778 }, { "epoch": 2.5160921002416994, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.253746032714844, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8768393993377686, "num_tokens": 754805846.0, "step": 19779 }, { "epoch": 2.51621931052029, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.03768539428711, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.8714020252227783, "num_tokens": 754844016.0, "step": 19780 }, { "epoch": 2.5163465207988804, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.33334732055664, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.861940860748291, "num_tokens": 754886493.0, "step": 19781 }, { "epoch": 2.516473731077471, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.14937973022461, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8647480010986328, "num_tokens": 754928195.0, "step": 19782 }, { "epoch": 2.5166009413560615, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.17125701904297, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8702384233474731, "num_tokens": 754958399.0, "step": 19783 }, { "epoch": 2.516728151634652, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.11509323120117, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8668762445449829, "num_tokens": 754997926.0, "step": 19784 }, { "epoch": 2.5168553619132426, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.40106964111328, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.8687500357627869, "num_tokens": 755040577.0, "step": 19785 }, { "epoch": 2.516982572191833, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.30124282836914, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.8584445118904114, "num_tokens": 755070458.0, "step": 19786 }, { "epoch": 2.5171097824704236, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.86869812011719, "learning_rate": 1e-06, "loss": 0.6768, "mean_token_accuracy": 0.8410815000534058, "num_tokens": 755117205.0, "step": 19787 }, { "epoch": 2.517236992749014, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.12026596069336, "learning_rate": 1e-06, "loss": 0.6056, "mean_token_accuracy": 0.8639647960662842, "num_tokens": 755162098.0, "step": 19788 }, { "epoch": 2.5173642030276047, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.168128967285156, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8591136932373047, "num_tokens": 755199655.0, "step": 19789 }, { "epoch": 2.517491413306195, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.972747802734375, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8757584095001221, "num_tokens": 755234165.0, "step": 19790 }, { "epoch": 2.5176186235847857, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.12632369995117, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8629747033119202, "num_tokens": 755271933.0, "step": 19791 }, { "epoch": 2.5177458338633762, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.358238220214844, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8719592094421387, "num_tokens": 755306903.0, "step": 19792 }, { "epoch": 2.5178730441419668, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.05546188354492, "learning_rate": 1e-06, "loss": 0.5778, "mean_token_accuracy": 0.8706595301628113, "num_tokens": 755347613.0, "step": 19793 }, { "epoch": 2.5180002544205573, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.2005729675293, "learning_rate": 1e-06, "loss": 0.5555, "mean_token_accuracy": 0.8804572820663452, "num_tokens": 755380541.0, "step": 19794 }, { "epoch": 2.518127464699148, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.64509582519531, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8599268198013306, "num_tokens": 755422502.0, "step": 19795 }, { "epoch": 2.5182546749777384, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.6275749206543, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8622288703918457, "num_tokens": 755463975.0, "step": 19796 }, { "epoch": 2.5183818852563284, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.39145278930664, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8633803129196167, "num_tokens": 755504873.0, "step": 19797 }, { "epoch": 2.5185090955349194, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.1401481628418, "learning_rate": 1e-06, "loss": 0.6841, "mean_token_accuracy": 0.837496280670166, "num_tokens": 755545646.0, "step": 19798 }, { "epoch": 2.5186363058135095, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.46272277832031, "learning_rate": 1e-06, "loss": 0.6513, "mean_token_accuracy": 0.8483083248138428, "num_tokens": 755577228.0, "step": 19799 }, { "epoch": 2.5187635160921005, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.06071090698242, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8719210624694824, "num_tokens": 755621437.0, "step": 19800 }, { "epoch": 2.5188907263706906, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.18696212768555, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8616848587989807, "num_tokens": 755658266.0, "step": 19801 }, { "epoch": 2.5190179366492815, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.49643325805664, "learning_rate": 1e-06, "loss": 0.6426, "mean_token_accuracy": 0.8505069017410278, "num_tokens": 755696755.0, "step": 19802 }, { "epoch": 2.5191451469278716, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.29816436767578, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.87615966796875, "num_tokens": 755732434.0, "step": 19803 }, { "epoch": 2.519272357206462, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.59907150268555, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8660829663276672, "num_tokens": 755768488.0, "step": 19804 }, { "epoch": 2.5193995674850527, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.891571044921875, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.867303192615509, "num_tokens": 755799637.0, "step": 19805 }, { "epoch": 2.519526777763643, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.49430465698242, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8470448851585388, "num_tokens": 755834229.0, "step": 19806 }, { "epoch": 2.5196539880422337, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.451290130615234, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8661584854125977, "num_tokens": 755871776.0, "step": 19807 }, { "epoch": 2.5197811983208243, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.618350982666016, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8728381395339966, "num_tokens": 755911125.0, "step": 19808 }, { "epoch": 2.519908408599415, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.842918395996094, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8710404634475708, "num_tokens": 755952204.0, "step": 19809 }, { "epoch": 2.5200356188780053, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.770572662353516, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8517318964004517, "num_tokens": 755996702.0, "step": 19810 }, { "epoch": 2.520162829156596, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.79323959350586, "learning_rate": 1e-06, "loss": 0.5511, "mean_token_accuracy": 0.8805885314941406, "num_tokens": 756033285.0, "step": 19811 }, { "epoch": 2.5202900394351864, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.4693489074707, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8624721169471741, "num_tokens": 756068851.0, "step": 19812 }, { "epoch": 2.520417249713777, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.97317886352539, "learning_rate": 1e-06, "loss": 0.6408, "mean_token_accuracy": 0.8544709086418152, "num_tokens": 756111676.0, "step": 19813 }, { "epoch": 2.5205444599923674, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.53292465209961, "learning_rate": 1e-06, "loss": 0.6776, "mean_token_accuracy": 0.844514787197113, "num_tokens": 756146575.0, "step": 19814 }, { "epoch": 2.520671670270958, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.94919204711914, "learning_rate": 1e-06, "loss": 0.6378, "mean_token_accuracy": 0.8523877263069153, "num_tokens": 756190006.0, "step": 19815 }, { "epoch": 2.5207988805495485, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.30949783325195, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8744439482688904, "num_tokens": 756232828.0, "step": 19816 }, { "epoch": 2.520926090828139, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.457149505615234, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8679468035697937, "num_tokens": 756273077.0, "step": 19817 }, { "epoch": 2.5210533011067295, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.450103759765625, "learning_rate": 1e-06, "loss": 0.6674, "mean_token_accuracy": 0.8475077152252197, "num_tokens": 756313421.0, "step": 19818 }, { "epoch": 2.52118051138532, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 61.97364044189453, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8566985130310059, "num_tokens": 756356461.0, "step": 19819 }, { "epoch": 2.5213077216639106, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.12560272216797, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.8575574159622192, "num_tokens": 756388309.0, "step": 19820 }, { "epoch": 2.521434931942501, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 54.0802116394043, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8727728128433228, "num_tokens": 756426655.0, "step": 19821 }, { "epoch": 2.521562142221091, "ewc_loss": 0.189453125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001659393310546875, "grad_norm": 48.76513671875, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8584566116333008, "num_tokens": 756464961.0, "step": 19822 }, { "epoch": 2.521689352499682, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.757869720458984, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8764159679412842, "num_tokens": 756497338.0, "step": 19823 }, { "epoch": 2.5218165627782723, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 49.516414642333984, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8692795634269714, "num_tokens": 756535324.0, "step": 19824 }, { "epoch": 2.5219437730568632, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.735992431640625, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8719130754470825, "num_tokens": 756574138.0, "step": 19825 }, { "epoch": 2.5220709833354533, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.108394622802734, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8733392953872681, "num_tokens": 756608967.0, "step": 19826 }, { "epoch": 2.522198193614044, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.31794357299805, "learning_rate": 1e-06, "loss": 0.6975, "mean_token_accuracy": 0.8409625291824341, "num_tokens": 756653882.0, "step": 19827 }, { "epoch": 2.5223254038926344, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.2887077331543, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8554047346115112, "num_tokens": 756695638.0, "step": 19828 }, { "epoch": 2.522452614171225, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.28464889526367, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8530865907669067, "num_tokens": 756734964.0, "step": 19829 }, { "epoch": 2.5225798244498154, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.237030029296875, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8663556575775146, "num_tokens": 756768454.0, "step": 19830 }, { "epoch": 2.522707034728406, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.539833068847656, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8505450487136841, "num_tokens": 756801696.0, "step": 19831 }, { "epoch": 2.5228342450069965, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.08249282836914, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8645815849304199, "num_tokens": 756839874.0, "step": 19832 }, { "epoch": 2.522961455285587, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.450870513916016, "learning_rate": 1e-06, "loss": 0.6428, "mean_token_accuracy": 0.8515149354934692, "num_tokens": 756884179.0, "step": 19833 }, { "epoch": 2.5230886655641775, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.85928726196289, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8627423644065857, "num_tokens": 756922725.0, "step": 19834 }, { "epoch": 2.523215875842768, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.99227523803711, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8565375208854675, "num_tokens": 756956080.0, "step": 19835 }, { "epoch": 2.5233430861213586, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.760868072509766, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8521144986152649, "num_tokens": 756995137.0, "step": 19836 }, { "epoch": 2.523470296399949, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.10845184326172, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8670168519020081, "num_tokens": 757035300.0, "step": 19837 }, { "epoch": 2.5235975066785397, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.323211669921875, "learning_rate": 1e-06, "loss": 0.6876, "mean_token_accuracy": 0.8364423513412476, "num_tokens": 757068789.0, "step": 19838 }, { "epoch": 2.52372471695713, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.25059509277344, "learning_rate": 1e-06, "loss": 0.6452, "mean_token_accuracy": 0.8555182218551636, "num_tokens": 757102913.0, "step": 19839 }, { "epoch": 2.5238519272357207, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.00818634033203, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8630808591842651, "num_tokens": 757138932.0, "step": 19840 }, { "epoch": 2.5239791375143112, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.14997100830078, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8656011819839478, "num_tokens": 757179181.0, "step": 19841 }, { "epoch": 2.5241063477929018, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.912750244140625, "learning_rate": 1e-06, "loss": 0.7005, "mean_token_accuracy": 0.8308809995651245, "num_tokens": 757224209.0, "step": 19842 }, { "epoch": 2.5242335580714923, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.818992614746094, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8755664825439453, "num_tokens": 757262398.0, "step": 19843 }, { "epoch": 2.524360768350083, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.35033416748047, "learning_rate": 1e-06, "loss": 0.6381, "mean_token_accuracy": 0.8517791032791138, "num_tokens": 757301744.0, "step": 19844 }, { "epoch": 2.5244879786286734, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.08484649658203, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.8657939434051514, "num_tokens": 757338273.0, "step": 19845 }, { "epoch": 2.524615188907264, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.494937896728516, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8542135953903198, "num_tokens": 757383169.0, "step": 19846 }, { "epoch": 2.524742399185854, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.63715744018555, "learning_rate": 1e-06, "loss": 0.6518, "mean_token_accuracy": 0.8518498539924622, "num_tokens": 757420998.0, "step": 19847 }, { "epoch": 2.524869609464445, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.70558547973633, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8666427135467529, "num_tokens": 757463281.0, "step": 19848 }, { "epoch": 2.524996819743035, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.03840637207031, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.85601806640625, "num_tokens": 757495022.0, "step": 19849 }, { "epoch": 2.525124030021626, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.545562744140625, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8674651980400085, "num_tokens": 757536801.0, "step": 19850 }, { "epoch": 2.525251240300216, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.870689392089844, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8766205310821533, "num_tokens": 757573419.0, "step": 19851 }, { "epoch": 2.5253784505788066, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.83955001831055, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8558319211006165, "num_tokens": 757610773.0, "step": 19852 }, { "epoch": 2.525505660857397, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.74525451660156, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8724719882011414, "num_tokens": 757648067.0, "step": 19853 }, { "epoch": 2.5256328711359877, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.11811828613281, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8679660558700562, "num_tokens": 757681598.0, "step": 19854 }, { "epoch": 2.525760081414578, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.387428283691406, "learning_rate": 1e-06, "loss": 0.694, "mean_token_accuracy": 0.8423059582710266, "num_tokens": 757724752.0, "step": 19855 }, { "epoch": 2.5258872916931687, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.762603759765625, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8724935054779053, "num_tokens": 757763801.0, "step": 19856 }, { "epoch": 2.5260145019717593, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.20746612548828, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8598482608795166, "num_tokens": 757799235.0, "step": 19857 }, { "epoch": 2.52614171225035, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.62686538696289, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.865168571472168, "num_tokens": 757839695.0, "step": 19858 }, { "epoch": 2.5262689225289403, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.43889236450195, "learning_rate": 1e-06, "loss": 0.66, "mean_token_accuracy": 0.8497622013092041, "num_tokens": 757877152.0, "step": 19859 }, { "epoch": 2.526396132807531, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.0494384765625, "learning_rate": 1e-06, "loss": 0.6183, "mean_token_accuracy": 0.8615931272506714, "num_tokens": 757916173.0, "step": 19860 }, { "epoch": 2.5265233430861214, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.757240295410156, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8681474924087524, "num_tokens": 757957271.0, "step": 19861 }, { "epoch": 2.526650553364712, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.850276947021484, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8548603057861328, "num_tokens": 757995553.0, "step": 19862 }, { "epoch": 2.5267777636433024, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.128211975097656, "learning_rate": 1e-06, "loss": 0.6581, "mean_token_accuracy": 0.8505253195762634, "num_tokens": 758035095.0, "step": 19863 }, { "epoch": 2.526904973921893, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.80143737792969, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8604738712310791, "num_tokens": 758071185.0, "step": 19864 }, { "epoch": 2.5270321842004835, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.85292434692383, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8644443154335022, "num_tokens": 758110251.0, "step": 19865 }, { "epoch": 2.527159394479074, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.92942810058594, "learning_rate": 1e-06, "loss": 0.6349, "mean_token_accuracy": 0.846243143081665, "num_tokens": 758144385.0, "step": 19866 }, { "epoch": 2.5272866047576645, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.579627990722656, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8728829622268677, "num_tokens": 758185935.0, "step": 19867 }, { "epoch": 2.527413815036255, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.082984924316406, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8662315607070923, "num_tokens": 758226688.0, "step": 19868 }, { "epoch": 2.5275410253148456, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.90190887451172, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.866889476776123, "num_tokens": 758263866.0, "step": 19869 }, { "epoch": 2.5276682355934357, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.9404411315918, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8685365319252014, "num_tokens": 758303607.0, "step": 19870 }, { "epoch": 2.5277954458720266, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.29042053222656, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8651278614997864, "num_tokens": 758340120.0, "step": 19871 }, { "epoch": 2.5279226561506167, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.91820526123047, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8589683771133423, "num_tokens": 758373792.0, "step": 19872 }, { "epoch": 2.5280498664292077, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.397796630859375, "learning_rate": 1e-06, "loss": 0.6522, "mean_token_accuracy": 0.8507766723632812, "num_tokens": 758418157.0, "step": 19873 }, { "epoch": 2.528177076707798, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.35779571533203, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8538317680358887, "num_tokens": 758454042.0, "step": 19874 }, { "epoch": 2.5283042869863888, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.94774627685547, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8644647002220154, "num_tokens": 758495305.0, "step": 19875 }, { "epoch": 2.528431497264979, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.300907135009766, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8689256906509399, "num_tokens": 758532646.0, "step": 19876 }, { "epoch": 2.5285587075435694, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.24219512939453, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.855952799320221, "num_tokens": 758568329.0, "step": 19877 }, { "epoch": 2.52868591782216, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.51024627685547, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.865800678730011, "num_tokens": 758608246.0, "step": 19878 }, { "epoch": 2.5288131281007504, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.79726028442383, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8715752363204956, "num_tokens": 758645356.0, "step": 19879 }, { "epoch": 2.528940338379341, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.35009002685547, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8775062561035156, "num_tokens": 758685657.0, "step": 19880 }, { "epoch": 2.5290675486579315, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.29207992553711, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8668392896652222, "num_tokens": 758718843.0, "step": 19881 }, { "epoch": 2.529194758936522, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.299110412597656, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.866669237613678, "num_tokens": 758757946.0, "step": 19882 }, { "epoch": 2.5293219692151125, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.33130645751953, "learning_rate": 1e-06, "loss": 0.6777, "mean_token_accuracy": 0.8419238328933716, "num_tokens": 758792832.0, "step": 19883 }, { "epoch": 2.529449179493703, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.221763610839844, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.8535799980163574, "num_tokens": 758826062.0, "step": 19884 }, { "epoch": 2.5295763897722936, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.115577697753906, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.8714751601219177, "num_tokens": 758860940.0, "step": 19885 }, { "epoch": 2.529703600050884, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.54878616333008, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8600112199783325, "num_tokens": 758900740.0, "step": 19886 }, { "epoch": 2.5298308103294747, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.96607971191406, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8588356971740723, "num_tokens": 758942786.0, "step": 19887 }, { "epoch": 2.529958020608065, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.47343826293945, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8703224062919617, "num_tokens": 758983376.0, "step": 19888 }, { "epoch": 2.5300852308866557, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.85564422607422, "learning_rate": 1e-06, "loss": 0.6253, "mean_token_accuracy": 0.8506612777709961, "num_tokens": 759024007.0, "step": 19889 }, { "epoch": 2.5302124411652462, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.1642951965332, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8711380958557129, "num_tokens": 759061611.0, "step": 19890 }, { "epoch": 2.5303396514438368, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.25743865966797, "learning_rate": 1e-06, "loss": 0.6594, "mean_token_accuracy": 0.8440650701522827, "num_tokens": 759101942.0, "step": 19891 }, { "epoch": 2.5304668617224273, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.97728729248047, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8779555559158325, "num_tokens": 759140281.0, "step": 19892 }, { "epoch": 2.530594072001018, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.61063766479492, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8613259196281433, "num_tokens": 759174319.0, "step": 19893 }, { "epoch": 2.5307212822796084, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.530696868896484, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8679181337356567, "num_tokens": 759208981.0, "step": 19894 }, { "epoch": 2.5308484925581984, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.8675537109375, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8635162115097046, "num_tokens": 759240559.0, "step": 19895 }, { "epoch": 2.5309757028367894, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.85453414916992, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8736780881881714, "num_tokens": 759280550.0, "step": 19896 }, { "epoch": 2.5311029131153795, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.57309341430664, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8551763892173767, "num_tokens": 759321298.0, "step": 19897 }, { "epoch": 2.5312301233939705, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.42136001586914, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.8742720484733582, "num_tokens": 759354627.0, "step": 19898 }, { "epoch": 2.5313573336725606, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.26540756225586, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8568156957626343, "num_tokens": 759388129.0, "step": 19899 }, { "epoch": 2.5314845439511515, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.41117858886719, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8708893656730652, "num_tokens": 759423667.0, "step": 19900 }, { "epoch": 2.5316117542297416, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.602577209472656, "learning_rate": 1e-06, "loss": 0.6415, "mean_token_accuracy": 0.8526008725166321, "num_tokens": 759466586.0, "step": 19901 }, { "epoch": 2.531738964508332, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.18538284301758, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8572229743003845, "num_tokens": 759504214.0, "step": 19902 }, { "epoch": 2.5318661747869227, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.72176742553711, "learning_rate": 1e-06, "loss": 0.6707, "mean_token_accuracy": 0.8470406532287598, "num_tokens": 759543865.0, "step": 19903 }, { "epoch": 2.531993385065513, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.990543365478516, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.8575565218925476, "num_tokens": 759581917.0, "step": 19904 }, { "epoch": 2.5321205953441037, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.62624740600586, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8648660182952881, "num_tokens": 759617641.0, "step": 19905 }, { "epoch": 2.5322478056226942, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.488502502441406, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8731666207313538, "num_tokens": 759649097.0, "step": 19906 }, { "epoch": 2.5323750159012848, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.356361389160156, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.857181191444397, "num_tokens": 759688802.0, "step": 19907 }, { "epoch": 2.5325022261798753, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.99235916137695, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8708019256591797, "num_tokens": 759724243.0, "step": 19908 }, { "epoch": 2.532629436458466, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.6060905456543, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8548383116722107, "num_tokens": 759764646.0, "step": 19909 }, { "epoch": 2.5327566467370564, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.2675895690918, "learning_rate": 1e-06, "loss": 0.6145, "mean_token_accuracy": 0.864609956741333, "num_tokens": 759806939.0, "step": 19910 }, { "epoch": 2.532883857015647, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.32788848876953, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8653074502944946, "num_tokens": 759848138.0, "step": 19911 }, { "epoch": 2.5330110672942374, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.7142448425293, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.8595897555351257, "num_tokens": 759881877.0, "step": 19912 }, { "epoch": 2.533138277572828, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.16518783569336, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8733808994293213, "num_tokens": 759922625.0, "step": 19913 }, { "epoch": 2.5332654878514185, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.54878234863281, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8730486035346985, "num_tokens": 759952429.0, "step": 19914 }, { "epoch": 2.533392698130009, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.42171859741211, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8642299175262451, "num_tokens": 759987678.0, "step": 19915 }, { "epoch": 2.5335199084085995, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.57426834106445, "learning_rate": 1e-06, "loss": 0.6561, "mean_token_accuracy": 0.8472336530685425, "num_tokens": 760023943.0, "step": 19916 }, { "epoch": 2.53364711868719, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.76620864868164, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8659560084342957, "num_tokens": 760068229.0, "step": 19917 }, { "epoch": 2.5337743289657806, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.85432815551758, "learning_rate": 1e-06, "loss": 0.642, "mean_token_accuracy": 0.8491314053535461, "num_tokens": 760109440.0, "step": 19918 }, { "epoch": 2.533901539244371, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.69230651855469, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8621613383293152, "num_tokens": 760146616.0, "step": 19919 }, { "epoch": 2.534028749522961, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.57780075073242, "learning_rate": 1e-06, "loss": 0.6611, "mean_token_accuracy": 0.8434345126152039, "num_tokens": 760188708.0, "step": 19920 }, { "epoch": 2.534155959801552, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.872859954833984, "learning_rate": 1e-06, "loss": 0.567, "mean_token_accuracy": 0.8787586688995361, "num_tokens": 760231355.0, "step": 19921 }, { "epoch": 2.5342831700801423, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.33197784423828, "learning_rate": 1e-06, "loss": 0.6537, "mean_token_accuracy": 0.8479628562927246, "num_tokens": 760268316.0, "step": 19922 }, { "epoch": 2.5344103803587332, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.60885238647461, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.8521273732185364, "num_tokens": 760305420.0, "step": 19923 }, { "epoch": 2.5345375906373233, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.66927719116211, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8812503814697266, "num_tokens": 760342195.0, "step": 19924 }, { "epoch": 2.534664800915914, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.27203369140625, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8758314847946167, "num_tokens": 760375394.0, "step": 19925 }, { "epoch": 2.5347920111945044, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.816463470458984, "learning_rate": 1e-06, "loss": 0.6479, "mean_token_accuracy": 0.8505223989486694, "num_tokens": 760411629.0, "step": 19926 }, { "epoch": 2.534919221473095, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 48.96017074584961, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8638805747032166, "num_tokens": 760456212.0, "step": 19927 }, { "epoch": 2.5350464317516854, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.87659454345703, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8536626100540161, "num_tokens": 760496385.0, "step": 19928 }, { "epoch": 2.535173642030276, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.13420867919922, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8613221645355225, "num_tokens": 760530426.0, "step": 19929 }, { "epoch": 2.5353008523088665, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.657371520996094, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8745155334472656, "num_tokens": 760568174.0, "step": 19930 }, { "epoch": 2.535428062587457, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.94933319091797, "learning_rate": 1e-06, "loss": 0.5605, "mean_token_accuracy": 0.88066166639328, "num_tokens": 760608097.0, "step": 19931 }, { "epoch": 2.5355552728660475, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.375492095947266, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8617279529571533, "num_tokens": 760643018.0, "step": 19932 }, { "epoch": 2.535682483144638, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.34193801879883, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8696947693824768, "num_tokens": 760676553.0, "step": 19933 }, { "epoch": 2.5358096934232286, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.96552276611328, "learning_rate": 1e-06, "loss": 0.5618, "mean_token_accuracy": 0.8766300678253174, "num_tokens": 760708850.0, "step": 19934 }, { "epoch": 2.535936903701819, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.04307556152344, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8574776649475098, "num_tokens": 760746088.0, "step": 19935 }, { "epoch": 2.5360641139804097, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.7425537109375, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8679483532905579, "num_tokens": 760785716.0, "step": 19936 }, { "epoch": 2.536191324259, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.8904914855957, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8697789907455444, "num_tokens": 760821301.0, "step": 19937 }, { "epoch": 2.5363185345375907, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.67620086669922, "learning_rate": 1e-06, "loss": 0.6388, "mean_token_accuracy": 0.8526872396469116, "num_tokens": 760861200.0, "step": 19938 }, { "epoch": 2.5364457448161812, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.339111328125, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8592429161071777, "num_tokens": 760896398.0, "step": 19939 }, { "epoch": 2.5365729550947718, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.214176177978516, "learning_rate": 1e-06, "loss": 0.6697, "mean_token_accuracy": 0.8414997458457947, "num_tokens": 760937300.0, "step": 19940 }, { "epoch": 2.5367001653733623, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.505027770996094, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8551993370056152, "num_tokens": 760973726.0, "step": 19941 }, { "epoch": 2.536827375651953, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.399085998535156, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8594912886619568, "num_tokens": 761009048.0, "step": 19942 }, { "epoch": 2.5369545859305433, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.450706481933594, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8596162796020508, "num_tokens": 761050836.0, "step": 19943 }, { "epoch": 2.537081796209134, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.31523132324219, "learning_rate": 1e-06, "loss": 0.6325, "mean_token_accuracy": 0.8522554636001587, "num_tokens": 761087517.0, "step": 19944 }, { "epoch": 2.537209006487724, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.865638732910156, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8759960532188416, "num_tokens": 761121401.0, "step": 19945 }, { "epoch": 2.537336216766315, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.969093322753906, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.850481390953064, "num_tokens": 761154973.0, "step": 19946 }, { "epoch": 2.537463427044905, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.70878219604492, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8702409267425537, "num_tokens": 761194211.0, "step": 19947 }, { "epoch": 2.537590637323496, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.00850296020508, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8555849194526672, "num_tokens": 761233486.0, "step": 19948 }, { "epoch": 2.537717847602086, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.56378173828125, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8620762825012207, "num_tokens": 761275846.0, "step": 19949 }, { "epoch": 2.5378450578806766, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.0984001159668, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8527286052703857, "num_tokens": 761311210.0, "step": 19950 }, { "epoch": 2.537972268159267, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.58735275268555, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8629367351531982, "num_tokens": 761349961.0, "step": 19951 }, { "epoch": 2.5380994784378577, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.9378547668457, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8689553737640381, "num_tokens": 761386993.0, "step": 19952 }, { "epoch": 2.538226688716448, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.59141540527344, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.8522129654884338, "num_tokens": 761426910.0, "step": 19953 }, { "epoch": 2.5383538989950387, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.800296783447266, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8657662868499756, "num_tokens": 761466633.0, "step": 19954 }, { "epoch": 2.5384811092736292, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.076168060302734, "learning_rate": 1e-06, "loss": 0.6514, "mean_token_accuracy": 0.8477303981781006, "num_tokens": 761497670.0, "step": 19955 }, { "epoch": 2.5386083195522198, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.85737991333008, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8587830662727356, "num_tokens": 761534626.0, "step": 19956 }, { "epoch": 2.5387355298308103, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.82480239868164, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8699350953102112, "num_tokens": 761581125.0, "step": 19957 }, { "epoch": 2.538862740109401, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.56370162963867, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8618398904800415, "num_tokens": 761616533.0, "step": 19958 }, { "epoch": 2.5389899503879914, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.12158966064453, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8520126342773438, "num_tokens": 761650615.0, "step": 19959 }, { "epoch": 2.539117160666582, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.290504455566406, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.873424232006073, "num_tokens": 761687020.0, "step": 19960 }, { "epoch": 2.5392443709451724, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.19210433959961, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.8763489723205566, "num_tokens": 761721653.0, "step": 19961 }, { "epoch": 2.539371581223763, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.5433349609375, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8695072531700134, "num_tokens": 761758377.0, "step": 19962 }, { "epoch": 2.5394987915023535, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.514713287353516, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8612309694290161, "num_tokens": 761795263.0, "step": 19963 }, { "epoch": 2.539626001780944, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.97005844116211, "learning_rate": 1e-06, "loss": 0.5673, "mean_token_accuracy": 0.8741244673728943, "num_tokens": 761834758.0, "step": 19964 }, { "epoch": 2.5397532120595345, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.93722915649414, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8551151752471924, "num_tokens": 761880482.0, "step": 19965 }, { "epoch": 2.539880422338125, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.66642761230469, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8619639873504639, "num_tokens": 761922494.0, "step": 19966 }, { "epoch": 2.5400076326167156, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.0183219909668, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8803246021270752, "num_tokens": 761959462.0, "step": 19967 }, { "epoch": 2.5401348428953057, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.59346008300781, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8517228364944458, "num_tokens": 761990316.0, "step": 19968 }, { "epoch": 2.5402620531738966, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.30085754394531, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8577392101287842, "num_tokens": 762023671.0, "step": 19969 }, { "epoch": 2.5403892634524867, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.94708251953125, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8679782152175903, "num_tokens": 762056719.0, "step": 19970 }, { "epoch": 2.5405164737310777, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.40760040283203, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8630416393280029, "num_tokens": 762098802.0, "step": 19971 }, { "epoch": 2.540643684009668, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.738136291503906, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8660366535186768, "num_tokens": 762134470.0, "step": 19972 }, { "epoch": 2.5407708942882588, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.02273178100586, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.863788366317749, "num_tokens": 762177163.0, "step": 19973 }, { "epoch": 2.540898104566849, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.36326599121094, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8704172372817993, "num_tokens": 762217323.0, "step": 19974 }, { "epoch": 2.5410253148454394, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.50667190551758, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.857572615146637, "num_tokens": 762252144.0, "step": 19975 }, { "epoch": 2.54115252512403, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 51.021419525146484, "learning_rate": 1e-06, "loss": 0.632, "mean_token_accuracy": 0.8547559976577759, "num_tokens": 762282108.0, "step": 19976 }, { "epoch": 2.5412797354026204, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.77030563354492, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.865684986114502, "num_tokens": 762318154.0, "step": 19977 }, { "epoch": 2.541406945681211, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.13826370239258, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8604518175125122, "num_tokens": 762354704.0, "step": 19978 }, { "epoch": 2.5415341559598015, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.2352409362793, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8641058802604675, "num_tokens": 762389943.0, "step": 19979 }, { "epoch": 2.541661366238392, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.16929626464844, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8640651106834412, "num_tokens": 762421560.0, "step": 19980 }, { "epoch": 2.5417885765169825, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.77859115600586, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8672694563865662, "num_tokens": 762458509.0, "step": 19981 }, { "epoch": 2.541915786795573, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 52.01484680175781, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8608829379081726, "num_tokens": 762502382.0, "step": 19982 }, { "epoch": 2.5420429970741636, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 49.89252853393555, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8584073781967163, "num_tokens": 762536611.0, "step": 19983 }, { "epoch": 2.542170207352754, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.410850524902344, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.863446831703186, "num_tokens": 762577801.0, "step": 19984 }, { "epoch": 2.5422974176313446, "ewc_loss": 0.1962890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017261505126953125, "grad_norm": 50.057594299316406, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8638671040534973, "num_tokens": 762617413.0, "step": 19985 }, { "epoch": 2.542424627909935, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.47884750366211, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8725126385688782, "num_tokens": 762654811.0, "step": 19986 }, { "epoch": 2.5425518381885257, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.82567596435547, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8671274185180664, "num_tokens": 762692902.0, "step": 19987 }, { "epoch": 2.5426790484671162, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.490726470947266, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8636856079101562, "num_tokens": 762731984.0, "step": 19988 }, { "epoch": 2.5428062587457068, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.55083084106445, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8708797693252563, "num_tokens": 762767104.0, "step": 19989 }, { "epoch": 2.5429334690242973, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.42363739013672, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8500703573226929, "num_tokens": 762804982.0, "step": 19990 }, { "epoch": 2.543060679302888, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.684776306152344, "learning_rate": 1e-06, "loss": 0.6326, "mean_token_accuracy": 0.8533356189727783, "num_tokens": 762842956.0, "step": 19991 }, { "epoch": 2.5431878895814783, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.444480895996094, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8694651126861572, "num_tokens": 762875445.0, "step": 19992 }, { "epoch": 2.5433150998600684, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.43754577636719, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.8759792447090149, "num_tokens": 762906061.0, "step": 19993 }, { "epoch": 2.5434423101386594, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.68980407714844, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8717879056930542, "num_tokens": 762945550.0, "step": 19994 }, { "epoch": 2.5435695204172495, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 51.042659759521484, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8621114492416382, "num_tokens": 762982586.0, "step": 19995 }, { "epoch": 2.5436967306958405, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.51428985595703, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.862480878829956, "num_tokens": 763013527.0, "step": 19996 }, { "epoch": 2.5438239409744305, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 51.26683807373047, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8551068305969238, "num_tokens": 763054363.0, "step": 19997 }, { "epoch": 2.5439511512530215, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.334373474121094, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8591834306716919, "num_tokens": 763096834.0, "step": 19998 }, { "epoch": 2.5440783615316116, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.971370697021484, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8624218702316284, "num_tokens": 763139611.0, "step": 19999 }, { "epoch": 2.544205571810202, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 52.075286865234375, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8638108968734741, "num_tokens": 763181085.0, "step": 20000 }, { "epoch": 2.5443327820887927, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.67876052856445, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8608347773551941, "num_tokens": 763219811.0, "step": 20001 }, { "epoch": 2.544459992367383, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 51.6318473815918, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8517709970474243, "num_tokens": 763257745.0, "step": 20002 }, { "epoch": 2.5445872026459737, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 50.62957000732422, "learning_rate": 1e-06, "loss": 0.6446, "mean_token_accuracy": 0.8473128080368042, "num_tokens": 763299475.0, "step": 20003 }, { "epoch": 2.5447144129245642, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 51.11860656738281, "learning_rate": 1e-06, "loss": 0.6726, "mean_token_accuracy": 0.8392535448074341, "num_tokens": 763333960.0, "step": 20004 }, { "epoch": 2.5448416232031548, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.82123947143555, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8547950983047485, "num_tokens": 763378646.0, "step": 20005 }, { "epoch": 2.5449688334817453, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.375667572021484, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8670046329498291, "num_tokens": 763412441.0, "step": 20006 }, { "epoch": 2.545096043760336, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 51.312774658203125, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8660694360733032, "num_tokens": 763446713.0, "step": 20007 }, { "epoch": 2.5452232540389264, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.506778717041016, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8724359273910522, "num_tokens": 763478686.0, "step": 20008 }, { "epoch": 2.545350464317517, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 51.1265869140625, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8777711391448975, "num_tokens": 763517258.0, "step": 20009 }, { "epoch": 2.5454776745961074, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.4911003112793, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8570550680160522, "num_tokens": 763553229.0, "step": 20010 }, { "epoch": 2.545604884874698, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.02113723754883, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.8700422048568726, "num_tokens": 763594274.0, "step": 20011 }, { "epoch": 2.5457320951532885, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 51.24412155151367, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8542433381080627, "num_tokens": 763632162.0, "step": 20012 }, { "epoch": 2.545859305431879, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.80267333984375, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8648453950881958, "num_tokens": 763666797.0, "step": 20013 }, { "epoch": 2.5459865157104695, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 51.30571365356445, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8589503169059753, "num_tokens": 763705475.0, "step": 20014 }, { "epoch": 2.54611372598906, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 51.19032669067383, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8787062764167786, "num_tokens": 763739434.0, "step": 20015 }, { "epoch": 2.5462409362676506, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 51.64357376098633, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8493109941482544, "num_tokens": 763773262.0, "step": 20016 }, { "epoch": 2.546368146546241, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.69820022583008, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8529454469680786, "num_tokens": 763812573.0, "step": 20017 }, { "epoch": 2.546495356824831, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 51.734840393066406, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8644477725028992, "num_tokens": 763848555.0, "step": 20018 }, { "epoch": 2.546622567103422, "ewc_loss": 0.1953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000171661376953125, "grad_norm": 50.376373291015625, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8597647547721863, "num_tokens": 763884608.0, "step": 20019 }, { "epoch": 2.5467497773820122, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.75362777709961, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8565882444381714, "num_tokens": 763922038.0, "step": 20020 }, { "epoch": 2.546876987660603, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.7465705871582, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8597473502159119, "num_tokens": 763958089.0, "step": 20021 }, { "epoch": 2.5470041979391933, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.645931243896484, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8637418150901794, "num_tokens": 764000802.0, "step": 20022 }, { "epoch": 2.547131408217784, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.78866195678711, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.866126298904419, "num_tokens": 764036628.0, "step": 20023 }, { "epoch": 2.5472586184963744, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.987335205078125, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8657681941986084, "num_tokens": 764073643.0, "step": 20024 }, { "epoch": 2.547385828774965, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.86368179321289, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8672806620597839, "num_tokens": 764110473.0, "step": 20025 }, { "epoch": 2.5475130390535554, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.60638427734375, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8597344160079956, "num_tokens": 764142941.0, "step": 20026 }, { "epoch": 2.547640249332146, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 51.2627067565918, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8650215864181519, "num_tokens": 764178752.0, "step": 20027 }, { "epoch": 2.5477674596107365, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.425987243652344, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8590418100357056, "num_tokens": 764217440.0, "step": 20028 }, { "epoch": 2.547894669889327, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.18954849243164, "learning_rate": 1e-06, "loss": 0.6338, "mean_token_accuracy": 0.8515465259552002, "num_tokens": 764257524.0, "step": 20029 }, { "epoch": 2.5480218801679175, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.469215393066406, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8567532896995544, "num_tokens": 764293645.0, "step": 20030 }, { "epoch": 2.548149090446508, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.776458740234375, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8694246411323547, "num_tokens": 764331660.0, "step": 20031 }, { "epoch": 2.5482763007250986, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 50.304954528808594, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8821452856063843, "num_tokens": 764376577.0, "step": 20032 }, { "epoch": 2.548403511003689, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.25156784057617, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8615626692771912, "num_tokens": 764415492.0, "step": 20033 }, { "epoch": 2.5485307212822796, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.38718795776367, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8517614006996155, "num_tokens": 764453948.0, "step": 20034 }, { "epoch": 2.54865793156087, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.42373275756836, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8674190044403076, "num_tokens": 764493906.0, "step": 20035 }, { "epoch": 2.5487851418394607, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 51.10485076904297, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8563458919525146, "num_tokens": 764532379.0, "step": 20036 }, { "epoch": 2.5489123521180512, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.76048278808594, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.8427692651748657, "num_tokens": 764567726.0, "step": 20037 }, { "epoch": 2.5490395623966418, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.15583419799805, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8633113503456116, "num_tokens": 764606229.0, "step": 20038 }, { "epoch": 2.5491667726752323, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.928524017333984, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8740764856338501, "num_tokens": 764641346.0, "step": 20039 }, { "epoch": 2.549293982953823, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.09394073486328, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8622242212295532, "num_tokens": 764687286.0, "step": 20040 }, { "epoch": 2.5494211932324133, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.041500091552734, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8658055663108826, "num_tokens": 764729235.0, "step": 20041 }, { "epoch": 2.549548403511004, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 51.369903564453125, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.8552939891815186, "num_tokens": 764771834.0, "step": 20042 }, { "epoch": 2.549675613789594, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 50.71532440185547, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.861809253692627, "num_tokens": 764808307.0, "step": 20043 }, { "epoch": 2.549802824068185, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.09445571899414, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8559280633926392, "num_tokens": 764848373.0, "step": 20044 }, { "epoch": 2.549930034346775, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.54222869873047, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8582238554954529, "num_tokens": 764886468.0, "step": 20045 }, { "epoch": 2.550057244625366, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.2727165222168, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8647758364677429, "num_tokens": 764928690.0, "step": 20046 }, { "epoch": 2.550184454903956, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.34410858154297, "learning_rate": 1e-06, "loss": 0.7217, "mean_token_accuracy": 0.8276287317276001, "num_tokens": 764969148.0, "step": 20047 }, { "epoch": 2.5503116651825466, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.2745475769043, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8574591875076294, "num_tokens": 765010754.0, "step": 20048 }, { "epoch": 2.550438875461137, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.33652877807617, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8659734725952148, "num_tokens": 765043562.0, "step": 20049 }, { "epoch": 2.5505660857397277, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.45060348510742, "learning_rate": 1e-06, "loss": 0.6744, "mean_token_accuracy": 0.843358039855957, "num_tokens": 765085638.0, "step": 20050 }, { "epoch": 2.550693296018318, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.46076583862305, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8676274418830872, "num_tokens": 765115847.0, "step": 20051 }, { "epoch": 2.5508205062969087, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.61384582519531, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8613171577453613, "num_tokens": 765146839.0, "step": 20052 }, { "epoch": 2.5509477165754992, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.645904541015625, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8618052005767822, "num_tokens": 765185921.0, "step": 20053 }, { "epoch": 2.5510749268540898, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.310707092285156, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8605576753616333, "num_tokens": 765224534.0, "step": 20054 }, { "epoch": 2.5512021371326803, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.262664794921875, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8727650046348572, "num_tokens": 765267160.0, "step": 20055 }, { "epoch": 2.551329347411271, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.41672134399414, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8629956841468811, "num_tokens": 765304770.0, "step": 20056 }, { "epoch": 2.5514565576898613, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.302215576171875, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8583434820175171, "num_tokens": 765341966.0, "step": 20057 }, { "epoch": 2.551583767968452, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.189517974853516, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8788087964057922, "num_tokens": 765382653.0, "step": 20058 }, { "epoch": 2.5517109782470424, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.29745864868164, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8591436743736267, "num_tokens": 765414940.0, "step": 20059 }, { "epoch": 2.551838188525633, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.04008102416992, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8629582524299622, "num_tokens": 765445549.0, "step": 20060 }, { "epoch": 2.5519653988042235, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.518978118896484, "learning_rate": 1e-06, "loss": 0.5919, "mean_token_accuracy": 0.8702771663665771, "num_tokens": 765484474.0, "step": 20061 }, { "epoch": 2.552092609082814, "ewc_loss": 0.197265625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001735687255859375, "grad_norm": 49.90666198730469, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8627773523330688, "num_tokens": 765522651.0, "step": 20062 }, { "epoch": 2.5522198193614045, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.090301513671875, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8586550354957581, "num_tokens": 765565983.0, "step": 20063 }, { "epoch": 2.552347029639995, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 50.06941604614258, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8627093434333801, "num_tokens": 765604056.0, "step": 20064 }, { "epoch": 2.5524742399185856, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.875457763671875, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8705357909202576, "num_tokens": 765646062.0, "step": 20065 }, { "epoch": 2.5526014501971757, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.156890869140625, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.8595675826072693, "num_tokens": 765687284.0, "step": 20066 }, { "epoch": 2.5527286604757666, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.80752182006836, "learning_rate": 1e-06, "loss": 0.6459, "mean_token_accuracy": 0.8497470617294312, "num_tokens": 765723743.0, "step": 20067 }, { "epoch": 2.5528558707543567, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.25580596923828, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8597468137741089, "num_tokens": 765768472.0, "step": 20068 }, { "epoch": 2.5529830810329477, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.056129455566406, "learning_rate": 1e-06, "loss": 0.5595, "mean_token_accuracy": 0.8791749477386475, "num_tokens": 765809040.0, "step": 20069 }, { "epoch": 2.5531102913115378, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.368770599365234, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8639693260192871, "num_tokens": 765845968.0, "step": 20070 }, { "epoch": 2.5532375015901287, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.778011322021484, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8686163425445557, "num_tokens": 765881037.0, "step": 20071 }, { "epoch": 2.553364711868719, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.60853958129883, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8670861721038818, "num_tokens": 765918945.0, "step": 20072 }, { "epoch": 2.5534919221473094, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.77101516723633, "learning_rate": 1e-06, "loss": 0.557, "mean_token_accuracy": 0.874530017375946, "num_tokens": 765956128.0, "step": 20073 }, { "epoch": 2.5536191324259, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.84962844848633, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.8766826391220093, "num_tokens": 765997109.0, "step": 20074 }, { "epoch": 2.5537463427044904, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.11725616455078, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8588390350341797, "num_tokens": 766032557.0, "step": 20075 }, { "epoch": 2.553873552983081, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.59502029418945, "learning_rate": 1e-06, "loss": 0.631, "mean_token_accuracy": 0.8597354888916016, "num_tokens": 766075285.0, "step": 20076 }, { "epoch": 2.5540007632616715, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.96440887451172, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8604528903961182, "num_tokens": 766110600.0, "step": 20077 }, { "epoch": 2.554127973540262, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.457611083984375, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8503992557525635, "num_tokens": 766146877.0, "step": 20078 }, { "epoch": 2.5542551838188525, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.9513053894043, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8587579727172852, "num_tokens": 766185164.0, "step": 20079 }, { "epoch": 2.554382394097443, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.34455871582031, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.8833980560302734, "num_tokens": 766221442.0, "step": 20080 }, { "epoch": 2.5545096043760336, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.66365432739258, "learning_rate": 1e-06, "loss": 0.6912, "mean_token_accuracy": 0.8364977240562439, "num_tokens": 766255258.0, "step": 20081 }, { "epoch": 2.554636814654624, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.834407806396484, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8590795993804932, "num_tokens": 766292392.0, "step": 20082 }, { "epoch": 2.5547640249332146, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.35710144042969, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8532999753952026, "num_tokens": 766335312.0, "step": 20083 }, { "epoch": 2.554891235211805, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.18849182128906, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8636586666107178, "num_tokens": 766372189.0, "step": 20084 }, { "epoch": 2.5550184454903957, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.16960906982422, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8516507744789124, "num_tokens": 766414021.0, "step": 20085 }, { "epoch": 2.5551456557689862, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.38494110107422, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8752472400665283, "num_tokens": 766452005.0, "step": 20086 }, { "epoch": 2.5552728660475768, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.52169418334961, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8556720614433289, "num_tokens": 766492150.0, "step": 20087 }, { "epoch": 2.5554000763261673, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.11241912841797, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.873276948928833, "num_tokens": 766533889.0, "step": 20088 }, { "epoch": 2.555527286604758, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.96474838256836, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8620148301124573, "num_tokens": 766565008.0, "step": 20089 }, { "epoch": 2.5556544968833483, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.48649597167969, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8636768460273743, "num_tokens": 766605744.0, "step": 20090 }, { "epoch": 2.5557817071619384, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.75606918334961, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.872370183467865, "num_tokens": 766645782.0, "step": 20091 }, { "epoch": 2.5559089174405294, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.544857025146484, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.8524225950241089, "num_tokens": 766683135.0, "step": 20092 }, { "epoch": 2.5560361277191195, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.449195861816406, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8629980683326721, "num_tokens": 766720691.0, "step": 20093 }, { "epoch": 2.5561633379977104, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.744346618652344, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8577213287353516, "num_tokens": 766755426.0, "step": 20094 }, { "epoch": 2.5562905482763005, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.43622970581055, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8682032823562622, "num_tokens": 766788315.0, "step": 20095 }, { "epoch": 2.5564177585548915, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.550228118896484, "learning_rate": 1e-06, "loss": 0.5613, "mean_token_accuracy": 0.8776755332946777, "num_tokens": 766825664.0, "step": 20096 }, { "epoch": 2.5565449688334816, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.287940979003906, "learning_rate": 1e-06, "loss": 0.5879, "mean_token_accuracy": 0.8741058111190796, "num_tokens": 766868145.0, "step": 20097 }, { "epoch": 2.556672179112072, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.405643463134766, "learning_rate": 1e-06, "loss": 0.6584, "mean_token_accuracy": 0.8509267568588257, "num_tokens": 766907278.0, "step": 20098 }, { "epoch": 2.5567993893906626, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.4828987121582, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8739237189292908, "num_tokens": 766941662.0, "step": 20099 }, { "epoch": 2.556926599669253, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.332176208496094, "learning_rate": 1e-06, "loss": 0.6844, "mean_token_accuracy": 0.8455931544303894, "num_tokens": 766977725.0, "step": 20100 }, { "epoch": 2.5570538099478437, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.51687240600586, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8556274175643921, "num_tokens": 767011011.0, "step": 20101 }, { "epoch": 2.5571810202264342, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.37392044067383, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8604555726051331, "num_tokens": 767047078.0, "step": 20102 }, { "epoch": 2.5573082305050248, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.518760681152344, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.858478307723999, "num_tokens": 767089208.0, "step": 20103 }, { "epoch": 2.5574354407836153, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.00395965576172, "learning_rate": 1e-06, "loss": 0.6711, "mean_token_accuracy": 0.8480322360992432, "num_tokens": 767131225.0, "step": 20104 }, { "epoch": 2.557562651062206, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.73713302612305, "learning_rate": 1e-06, "loss": 0.6441, "mean_token_accuracy": 0.8487952947616577, "num_tokens": 767165222.0, "step": 20105 }, { "epoch": 2.5576898613407963, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.428165435791016, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8635501861572266, "num_tokens": 767200870.0, "step": 20106 }, { "epoch": 2.557817071619387, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.87942123413086, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8593109846115112, "num_tokens": 767236881.0, "step": 20107 }, { "epoch": 2.5579442818979774, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.33244705200195, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8668069839477539, "num_tokens": 767269716.0, "step": 20108 }, { "epoch": 2.558071492176568, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.962669372558594, "learning_rate": 1e-06, "loss": 0.6441, "mean_token_accuracy": 0.8523798584938049, "num_tokens": 767306840.0, "step": 20109 }, { "epoch": 2.5581987024551585, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.50437545776367, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.874832034111023, "num_tokens": 767346966.0, "step": 20110 }, { "epoch": 2.558325912733749, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.49026107788086, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8594248294830322, "num_tokens": 767387084.0, "step": 20111 }, { "epoch": 2.5584531230123395, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.71607971191406, "learning_rate": 1e-06, "loss": 0.6695, "mean_token_accuracy": 0.8423094749450684, "num_tokens": 767417857.0, "step": 20112 }, { "epoch": 2.55858033329093, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.30246353149414, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8623714447021484, "num_tokens": 767455771.0, "step": 20113 }, { "epoch": 2.5587075435695206, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.561607360839844, "learning_rate": 1e-06, "loss": 0.6456, "mean_token_accuracy": 0.8503195643424988, "num_tokens": 767493187.0, "step": 20114 }, { "epoch": 2.558834753848111, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.175567626953125, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8546296954154968, "num_tokens": 767534526.0, "step": 20115 }, { "epoch": 2.558961964126701, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.77419662475586, "learning_rate": 1e-06, "loss": 0.6236, "mean_token_accuracy": 0.8592548966407776, "num_tokens": 767569297.0, "step": 20116 }, { "epoch": 2.559089174405292, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.3950080871582, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8577901124954224, "num_tokens": 767605860.0, "step": 20117 }, { "epoch": 2.5592163846838822, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.315616607666016, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8651262521743774, "num_tokens": 767645252.0, "step": 20118 }, { "epoch": 2.559343594962473, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.71585464477539, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8614702224731445, "num_tokens": 767681942.0, "step": 20119 }, { "epoch": 2.5594708052410633, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.296871185302734, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8627548217773438, "num_tokens": 767720631.0, "step": 20120 }, { "epoch": 2.559598015519654, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.33635330200195, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8671456575393677, "num_tokens": 767757018.0, "step": 20121 }, { "epoch": 2.5597252257982444, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.4244384765625, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.8647972345352173, "num_tokens": 767796046.0, "step": 20122 }, { "epoch": 2.559852436076835, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.12297058105469, "learning_rate": 1e-06, "loss": 0.5824, "mean_token_accuracy": 0.8728973865509033, "num_tokens": 767835678.0, "step": 20123 }, { "epoch": 2.5599796463554254, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.64282989501953, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8752455115318298, "num_tokens": 767870515.0, "step": 20124 }, { "epoch": 2.560106856634016, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.10431671142578, "learning_rate": 1e-06, "loss": 0.6626, "mean_token_accuracy": 0.8460706472396851, "num_tokens": 767911573.0, "step": 20125 }, { "epoch": 2.5602340669126065, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.74778366088867, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.860942006111145, "num_tokens": 767950161.0, "step": 20126 }, { "epoch": 2.560361277191197, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.51724624633789, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8597815036773682, "num_tokens": 767989431.0, "step": 20127 }, { "epoch": 2.5604884874697875, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.260215759277344, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.87067049741745, "num_tokens": 768023551.0, "step": 20128 }, { "epoch": 2.560615697748378, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.07164001464844, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8697676658630371, "num_tokens": 768055317.0, "step": 20129 }, { "epoch": 2.5607429080269686, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.96282958984375, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8543589115142822, "num_tokens": 768089575.0, "step": 20130 }, { "epoch": 2.560870118305559, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.85453414916992, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8529564142227173, "num_tokens": 768131795.0, "step": 20131 }, { "epoch": 2.5609973285841496, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.07773971557617, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8667742013931274, "num_tokens": 768165714.0, "step": 20132 }, { "epoch": 2.56112453886274, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.448787689208984, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8705528974533081, "num_tokens": 768203475.0, "step": 20133 }, { "epoch": 2.5612517491413307, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.530128479003906, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8795851469039917, "num_tokens": 768244841.0, "step": 20134 }, { "epoch": 2.561378959419921, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.95294952392578, "learning_rate": 1e-06, "loss": 0.6336, "mean_token_accuracy": 0.8585840463638306, "num_tokens": 768286502.0, "step": 20135 }, { "epoch": 2.5615061696985117, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.292388916015625, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8699966669082642, "num_tokens": 768320628.0, "step": 20136 }, { "epoch": 2.5616333799771023, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.77791976928711, "learning_rate": 1e-06, "loss": 0.6423, "mean_token_accuracy": 0.8542987704277039, "num_tokens": 768353630.0, "step": 20137 }, { "epoch": 2.561760590255693, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.550479888916016, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8605151176452637, "num_tokens": 768388170.0, "step": 20138 }, { "epoch": 2.5618878005342833, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.79899597167969, "learning_rate": 1e-06, "loss": 0.5577, "mean_token_accuracy": 0.8786494135856628, "num_tokens": 768428238.0, "step": 20139 }, { "epoch": 2.562015010812874, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.56330490112305, "learning_rate": 1e-06, "loss": 0.5625, "mean_token_accuracy": 0.876653254032135, "num_tokens": 768460075.0, "step": 20140 }, { "epoch": 2.562142221091464, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.44758224487305, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.8798775672912598, "num_tokens": 768493982.0, "step": 20141 }, { "epoch": 2.562269431370055, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.3451042175293, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8660591840744019, "num_tokens": 768529121.0, "step": 20142 }, { "epoch": 2.562396641648645, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.08189010620117, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8635612726211548, "num_tokens": 768560730.0, "step": 20143 }, { "epoch": 2.562523851927236, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.178245544433594, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8740124702453613, "num_tokens": 768603015.0, "step": 20144 }, { "epoch": 2.562651062205826, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.836891174316406, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8624745011329651, "num_tokens": 768639847.0, "step": 20145 }, { "epoch": 2.5627782724844166, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.288150787353516, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8598998785018921, "num_tokens": 768681614.0, "step": 20146 }, { "epoch": 2.562905482763007, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.84627151489258, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8656096458435059, "num_tokens": 768722052.0, "step": 20147 }, { "epoch": 2.5630326930415976, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.71604537963867, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8556928634643555, "num_tokens": 768754191.0, "step": 20148 }, { "epoch": 2.563159903320188, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.29908752441406, "learning_rate": 1e-06, "loss": 0.6483, "mean_token_accuracy": 0.8523703217506409, "num_tokens": 768792981.0, "step": 20149 }, { "epoch": 2.5632871135987787, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.76994705200195, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8513956069946289, "num_tokens": 768827040.0, "step": 20150 }, { "epoch": 2.5634143238773692, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.53506851196289, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8614414930343628, "num_tokens": 768865669.0, "step": 20151 }, { "epoch": 2.5635415341559598, "ewc_loss": 0.1982421875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017452239990234375, "grad_norm": 49.60616683959961, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8601217269897461, "num_tokens": 768903306.0, "step": 20152 }, { "epoch": 2.5636687444345503, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.471309661865234, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8659306168556213, "num_tokens": 768946322.0, "step": 20153 }, { "epoch": 2.563795954713141, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.532169342041016, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8802552223205566, "num_tokens": 768987350.0, "step": 20154 }, { "epoch": 2.5639231649917313, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.230228424072266, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8714037537574768, "num_tokens": 769030762.0, "step": 20155 }, { "epoch": 2.564050375270322, "ewc_loss": 0.19921875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017547607421875, "grad_norm": 49.5837287902832, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8747814893722534, "num_tokens": 769069827.0, "step": 20156 }, { "epoch": 2.5641775855489124, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.6671257019043, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8632464408874512, "num_tokens": 769107742.0, "step": 20157 }, { "epoch": 2.564304795827503, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.54877471923828, "learning_rate": 1e-06, "loss": 0.6319, "mean_token_accuracy": 0.8595196008682251, "num_tokens": 769153286.0, "step": 20158 }, { "epoch": 2.5644320061060935, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.82664108276367, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8645308017730713, "num_tokens": 769192646.0, "step": 20159 }, { "epoch": 2.564559216384684, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.721519470214844, "learning_rate": 1e-06, "loss": 0.6797, "mean_token_accuracy": 0.8408945798873901, "num_tokens": 769230360.0, "step": 20160 }, { "epoch": 2.5646864266632745, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.62236404418945, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8766383528709412, "num_tokens": 769270754.0, "step": 20161 }, { "epoch": 2.564813636941865, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.06376647949219, "learning_rate": 1e-06, "loss": 0.6786, "mean_token_accuracy": 0.8398873805999756, "num_tokens": 769310109.0, "step": 20162 }, { "epoch": 2.5649408472204556, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 48.97286605834961, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.861107587814331, "num_tokens": 769352101.0, "step": 20163 }, { "epoch": 2.5650680574990457, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.82541275024414, "learning_rate": 1e-06, "loss": 0.5727, "mean_token_accuracy": 0.8767956495285034, "num_tokens": 769386570.0, "step": 20164 }, { "epoch": 2.5651952677776366, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.21774673461914, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.8753007650375366, "num_tokens": 769427167.0, "step": 20165 }, { "epoch": 2.5653224780562267, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.05300521850586, "learning_rate": 1e-06, "loss": 0.6503, "mean_token_accuracy": 0.850866436958313, "num_tokens": 769465456.0, "step": 20166 }, { "epoch": 2.5654496883348177, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.76759719848633, "learning_rate": 1e-06, "loss": 0.5598, "mean_token_accuracy": 0.874192476272583, "num_tokens": 769497135.0, "step": 20167 }, { "epoch": 2.5655768986134078, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 50.61463928222656, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8578673005104065, "num_tokens": 769533706.0, "step": 20168 }, { "epoch": 2.5657041088919987, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.80607986450195, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8573850393295288, "num_tokens": 769578698.0, "step": 20169 }, { "epoch": 2.565831319170589, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.52717971801758, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8758928775787354, "num_tokens": 769619799.0, "step": 20170 }, { "epoch": 2.5659585294491793, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.27211380004883, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8618639707565308, "num_tokens": 769655973.0, "step": 20171 }, { "epoch": 2.56608573972777, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.12675857543945, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8578980565071106, "num_tokens": 769692333.0, "step": 20172 }, { "epoch": 2.5662129500063604, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.5494384765625, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.865312933921814, "num_tokens": 769731804.0, "step": 20173 }, { "epoch": 2.566340160284951, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.805511474609375, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8679089546203613, "num_tokens": 769767409.0, "step": 20174 }, { "epoch": 2.5664673705635415, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.03861618041992, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8587578535079956, "num_tokens": 769806468.0, "step": 20175 }, { "epoch": 2.566594580842132, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.78888702392578, "learning_rate": 1e-06, "loss": 0.5771, "mean_token_accuracy": 0.868887186050415, "num_tokens": 769842025.0, "step": 20176 }, { "epoch": 2.5667217911207225, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 50.60326385498047, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.8567987680435181, "num_tokens": 769884125.0, "step": 20177 }, { "epoch": 2.566849001399313, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.42070388793945, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8510275483131409, "num_tokens": 769920523.0, "step": 20178 }, { "epoch": 2.5669762116779036, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.396244049072266, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8601853847503662, "num_tokens": 769960141.0, "step": 20179 }, { "epoch": 2.567103421956494, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.7918701171875, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8638899922370911, "num_tokens": 769995281.0, "step": 20180 }, { "epoch": 2.5672306322350846, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.964691162109375, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8666105270385742, "num_tokens": 770028230.0, "step": 20181 }, { "epoch": 2.567357842513675, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.369815826416016, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8688874244689941, "num_tokens": 770068996.0, "step": 20182 }, { "epoch": 2.5674850527922657, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.161163330078125, "learning_rate": 1e-06, "loss": 0.6486, "mean_token_accuracy": 0.8512303829193115, "num_tokens": 770107018.0, "step": 20183 }, { "epoch": 2.567612263070856, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.71814727783203, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8620885610580444, "num_tokens": 770140251.0, "step": 20184 }, { "epoch": 2.5677394733494467, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.0484733581543, "learning_rate": 1e-06, "loss": 0.6529, "mean_token_accuracy": 0.8541216850280762, "num_tokens": 770182241.0, "step": 20185 }, { "epoch": 2.5678666836280373, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.71138381958008, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8779935836791992, "num_tokens": 770223267.0, "step": 20186 }, { "epoch": 2.567993893906628, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.350563049316406, "learning_rate": 1e-06, "loss": 0.6459, "mean_token_accuracy": 0.851833164691925, "num_tokens": 770261856.0, "step": 20187 }, { "epoch": 2.5681211041852183, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.56340408325195, "learning_rate": 1e-06, "loss": 0.642, "mean_token_accuracy": 0.8525679111480713, "num_tokens": 770299762.0, "step": 20188 }, { "epoch": 2.5682483144638084, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.13223648071289, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8563076853752136, "num_tokens": 770333690.0, "step": 20189 }, { "epoch": 2.5683755247423994, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.04341125488281, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8675870895385742, "num_tokens": 770375755.0, "step": 20190 }, { "epoch": 2.5685027350209895, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.05514144897461, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8727855682373047, "num_tokens": 770417805.0, "step": 20191 }, { "epoch": 2.5686299452995804, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.525264739990234, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8612194061279297, "num_tokens": 770458183.0, "step": 20192 }, { "epoch": 2.5687571555781705, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.145877838134766, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8642226457595825, "num_tokens": 770488973.0, "step": 20193 }, { "epoch": 2.5688843658567615, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.686302185058594, "learning_rate": 1e-06, "loss": 0.629, "mean_token_accuracy": 0.8550307750701904, "num_tokens": 770529030.0, "step": 20194 }, { "epoch": 2.5690115761353516, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.00183868408203, "learning_rate": 1e-06, "loss": 0.6453, "mean_token_accuracy": 0.8516169190406799, "num_tokens": 770574697.0, "step": 20195 }, { "epoch": 2.569138786413942, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.19089889526367, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.870529294013977, "num_tokens": 770615417.0, "step": 20196 }, { "epoch": 2.5692659966925326, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.00057601928711, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.8485947847366333, "num_tokens": 770653228.0, "step": 20197 }, { "epoch": 2.569393206971123, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.8773193359375, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8785904049873352, "num_tokens": 770693876.0, "step": 20198 }, { "epoch": 2.5695204172497137, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.637962341308594, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8617991209030151, "num_tokens": 770728754.0, "step": 20199 }, { "epoch": 2.5696476275283042, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.74074935913086, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.8611664175987244, "num_tokens": 770767961.0, "step": 20200 }, { "epoch": 2.5697748378068948, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.2977409362793, "learning_rate": 1e-06, "loss": 0.7105, "mean_token_accuracy": 0.8350087404251099, "num_tokens": 770808880.0, "step": 20201 }, { "epoch": 2.5699020480854853, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.32656478881836, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.8580050468444824, "num_tokens": 770846483.0, "step": 20202 }, { "epoch": 2.570029258364076, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.5898323059082, "learning_rate": 1e-06, "loss": 0.6943, "mean_token_accuracy": 0.8419390916824341, "num_tokens": 770884234.0, "step": 20203 }, { "epoch": 2.5701564686426663, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 49.42906951904297, "learning_rate": 1e-06, "loss": 0.658, "mean_token_accuracy": 0.8476157188415527, "num_tokens": 770918785.0, "step": 20204 }, { "epoch": 2.570283678921257, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.72739791870117, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8841352462768555, "num_tokens": 770956889.0, "step": 20205 }, { "epoch": 2.5704108891998474, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 49.61649703979492, "learning_rate": 1e-06, "loss": 0.6478, "mean_token_accuracy": 0.8507612943649292, "num_tokens": 770994937.0, "step": 20206 }, { "epoch": 2.570538099478438, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.44772720336914, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8690068125724792, "num_tokens": 771031451.0, "step": 20207 }, { "epoch": 2.5706653097570284, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.0310173034668, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8705556988716125, "num_tokens": 771061575.0, "step": 20208 }, { "epoch": 2.570792520035619, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.290496826171875, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8618463277816772, "num_tokens": 771103366.0, "step": 20209 }, { "epoch": 2.5709197303142095, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.42916488647461, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8685488700866699, "num_tokens": 771142178.0, "step": 20210 }, { "epoch": 2.5710469405928, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.63801193237305, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8628947734832764, "num_tokens": 771179296.0, "step": 20211 }, { "epoch": 2.5711741508713906, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.47473907470703, "learning_rate": 1e-06, "loss": 0.6362, "mean_token_accuracy": 0.8564989566802979, "num_tokens": 771219084.0, "step": 20212 }, { "epoch": 2.571301361149981, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.548377990722656, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8687079548835754, "num_tokens": 771250609.0, "step": 20213 }, { "epoch": 2.571428571428571, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.42074966430664, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8598066568374634, "num_tokens": 771286191.0, "step": 20214 }, { "epoch": 2.571555781707162, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.37998580932617, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.859500527381897, "num_tokens": 771320898.0, "step": 20215 }, { "epoch": 2.5716829919857522, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.84728240966797, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8547887802124023, "num_tokens": 771353977.0, "step": 20216 }, { "epoch": 2.571810202264343, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.68782424926758, "learning_rate": 1e-06, "loss": 0.5856, "mean_token_accuracy": 0.8693464994430542, "num_tokens": 771391960.0, "step": 20217 }, { "epoch": 2.5719374125429333, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.47576904296875, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8548798561096191, "num_tokens": 771427764.0, "step": 20218 }, { "epoch": 2.572064622821524, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.84738540649414, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.86314457654953, "num_tokens": 771469706.0, "step": 20219 }, { "epoch": 2.5721918331001143, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.54866027832031, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8837961554527283, "num_tokens": 771504296.0, "step": 20220 }, { "epoch": 2.572319043378705, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.840675354003906, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8629109859466553, "num_tokens": 771544496.0, "step": 20221 }, { "epoch": 2.5724462536572954, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.01835250854492, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8710260391235352, "num_tokens": 771584322.0, "step": 20222 }, { "epoch": 2.572573463935886, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.149444580078125, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.873313307762146, "num_tokens": 771621424.0, "step": 20223 }, { "epoch": 2.5727006742144765, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.06474304199219, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8629311919212341, "num_tokens": 771658173.0, "step": 20224 }, { "epoch": 2.572827884493067, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.78683853149414, "learning_rate": 1e-06, "loss": 0.6443, "mean_token_accuracy": 0.8542484045028687, "num_tokens": 771698182.0, "step": 20225 }, { "epoch": 2.5729550947716575, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.25282669067383, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8474193215370178, "num_tokens": 771731559.0, "step": 20226 }, { "epoch": 2.573082305050248, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.93699264526367, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.859610915184021, "num_tokens": 771774362.0, "step": 20227 }, { "epoch": 2.5732095153288386, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.070011138916016, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8516949415206909, "num_tokens": 771810540.0, "step": 20228 }, { "epoch": 2.573336725607429, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.0485954284668, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8667963743209839, "num_tokens": 771846221.0, "step": 20229 }, { "epoch": 2.5734639358860196, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.49673843383789, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8719059228897095, "num_tokens": 771882268.0, "step": 20230 }, { "epoch": 2.57359114616461, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.937644958496094, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8596925735473633, "num_tokens": 771913176.0, "step": 20231 }, { "epoch": 2.5737183564432007, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.17020797729492, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8554357290267944, "num_tokens": 771948163.0, "step": 20232 }, { "epoch": 2.573845566721791, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.31757736206055, "learning_rate": 1e-06, "loss": 0.5702, "mean_token_accuracy": 0.8740606307983398, "num_tokens": 771988296.0, "step": 20233 }, { "epoch": 2.5739727770003817, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 49.74800109863281, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8688609004020691, "num_tokens": 772024330.0, "step": 20234 }, { "epoch": 2.5740999872789723, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.236270904541016, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.849863588809967, "num_tokens": 772062291.0, "step": 20235 }, { "epoch": 2.574227197557563, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 49.864505767822266, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8647607564926147, "num_tokens": 772110239.0, "step": 20236 }, { "epoch": 2.5743544078361533, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.06092071533203, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8590412735939026, "num_tokens": 772153201.0, "step": 20237 }, { "epoch": 2.574481618114744, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.940711975097656, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8640201687812805, "num_tokens": 772186816.0, "step": 20238 }, { "epoch": 2.574608828393334, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.61714172363281, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8713338375091553, "num_tokens": 772224238.0, "step": 20239 }, { "epoch": 2.574736038671925, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.537410736083984, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8725535273551941, "num_tokens": 772263505.0, "step": 20240 }, { "epoch": 2.574863248950515, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.91859436035156, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8718012571334839, "num_tokens": 772303205.0, "step": 20241 }, { "epoch": 2.574990459229106, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.28728103637695, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8685721158981323, "num_tokens": 772344495.0, "step": 20242 }, { "epoch": 2.575117669507696, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.63902282714844, "learning_rate": 1e-06, "loss": 0.6367, "mean_token_accuracy": 0.8572177886962891, "num_tokens": 772380296.0, "step": 20243 }, { "epoch": 2.5752448797862866, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.651981353759766, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.876203179359436, "num_tokens": 772423933.0, "step": 20244 }, { "epoch": 2.575372090064877, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.83497619628906, "learning_rate": 1e-06, "loss": 0.663, "mean_token_accuracy": 0.8474327325820923, "num_tokens": 772464514.0, "step": 20245 }, { "epoch": 2.5754993003434676, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.493370056152344, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8815863728523254, "num_tokens": 772498501.0, "step": 20246 }, { "epoch": 2.575626510622058, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.78660202026367, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8590811491012573, "num_tokens": 772538072.0, "step": 20247 }, { "epoch": 2.5757537209006487, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.53243637084961, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.862346351146698, "num_tokens": 772577434.0, "step": 20248 }, { "epoch": 2.575880931179239, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.96577835083008, "learning_rate": 1e-06, "loss": 0.5564, "mean_token_accuracy": 0.8788307309150696, "num_tokens": 772612917.0, "step": 20249 }, { "epoch": 2.5760081414578297, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.422950744628906, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8737461566925049, "num_tokens": 772652939.0, "step": 20250 }, { "epoch": 2.5761353517364203, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.181182861328125, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8687219023704529, "num_tokens": 772688018.0, "step": 20251 }, { "epoch": 2.576262562015011, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.21809768676758, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8533686399459839, "num_tokens": 772728071.0, "step": 20252 }, { "epoch": 2.5763897722936013, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.654441833496094, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8638333082199097, "num_tokens": 772762730.0, "step": 20253 }, { "epoch": 2.576516982572192, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.55284118652344, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8673098683357239, "num_tokens": 772799125.0, "step": 20254 }, { "epoch": 2.5766441928507824, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.080196380615234, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8744690418243408, "num_tokens": 772838657.0, "step": 20255 }, { "epoch": 2.576771403129373, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.078216552734375, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8693788051605225, "num_tokens": 772876470.0, "step": 20256 }, { "epoch": 2.5768986134079634, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.78226852416992, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8570512533187866, "num_tokens": 772910369.0, "step": 20257 }, { "epoch": 2.577025823686554, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.74452590942383, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8678162693977356, "num_tokens": 772948129.0, "step": 20258 }, { "epoch": 2.5771530339651445, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.10259246826172, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8895318508148193, "num_tokens": 772985359.0, "step": 20259 }, { "epoch": 2.577280244243735, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.345882415771484, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8642182946205139, "num_tokens": 773019402.0, "step": 20260 }, { "epoch": 2.5774074545223256, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.31161117553711, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8619468212127686, "num_tokens": 773059176.0, "step": 20261 }, { "epoch": 2.5775346648009156, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.63202667236328, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8640660643577576, "num_tokens": 773094274.0, "step": 20262 }, { "epoch": 2.5776618750795066, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.67017364501953, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8626492023468018, "num_tokens": 773135679.0, "step": 20263 }, { "epoch": 2.5777890853580967, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.11760330200195, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8588722944259644, "num_tokens": 773174270.0, "step": 20264 }, { "epoch": 2.5779162956366877, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.4658317565918, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8667676448822021, "num_tokens": 773213582.0, "step": 20265 }, { "epoch": 2.5780435059152778, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.250511169433594, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8584476113319397, "num_tokens": 773244980.0, "step": 20266 }, { "epoch": 2.5781707161938687, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.1405143737793, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.8774641752243042, "num_tokens": 773287395.0, "step": 20267 }, { "epoch": 2.578297926472459, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.74756622314453, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8600785136222839, "num_tokens": 773324593.0, "step": 20268 }, { "epoch": 2.5784251367510493, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.03148651123047, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8579834699630737, "num_tokens": 773361802.0, "step": 20269 }, { "epoch": 2.57855234702964, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.51810836791992, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8687973022460938, "num_tokens": 773401348.0, "step": 20270 }, { "epoch": 2.5786795573082304, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.81568145751953, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8799999952316284, "num_tokens": 773439040.0, "step": 20271 }, { "epoch": 2.578806767586821, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.032440185546875, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8754604458808899, "num_tokens": 773471382.0, "step": 20272 }, { "epoch": 2.5789339778654115, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.71950912475586, "learning_rate": 1e-06, "loss": 0.6415, "mean_token_accuracy": 0.8515748381614685, "num_tokens": 773508924.0, "step": 20273 }, { "epoch": 2.579061188144002, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.88477325439453, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8786134719848633, "num_tokens": 773550219.0, "step": 20274 }, { "epoch": 2.5791883984225925, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.84537124633789, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8603006601333618, "num_tokens": 773589363.0, "step": 20275 }, { "epoch": 2.579315608701183, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.347267150878906, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8631917238235474, "num_tokens": 773630289.0, "step": 20276 }, { "epoch": 2.5794428189797736, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.26138687133789, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.863997220993042, "num_tokens": 773668660.0, "step": 20277 }, { "epoch": 2.579570029258364, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.668941497802734, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8598134517669678, "num_tokens": 773704318.0, "step": 20278 }, { "epoch": 2.5796972395369546, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.87569046020508, "learning_rate": 1e-06, "loss": 0.6606, "mean_token_accuracy": 0.8486429452896118, "num_tokens": 773745821.0, "step": 20279 }, { "epoch": 2.579824449815545, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.95050048828125, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8727749586105347, "num_tokens": 773782880.0, "step": 20280 }, { "epoch": 2.5799516600941357, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.63019561767578, "learning_rate": 1e-06, "loss": 0.6532, "mean_token_accuracy": 0.8479174971580505, "num_tokens": 773824880.0, "step": 20281 }, { "epoch": 2.580078870372726, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.003170013427734, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8675425052642822, "num_tokens": 773863210.0, "step": 20282 }, { "epoch": 2.5802060806513167, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.32984161376953, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8711472749710083, "num_tokens": 773903432.0, "step": 20283 }, { "epoch": 2.5803332909299073, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.91590118408203, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8701570630073547, "num_tokens": 773942424.0, "step": 20284 }, { "epoch": 2.580460501208498, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.71696090698242, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8615977764129639, "num_tokens": 773976598.0, "step": 20285 }, { "epoch": 2.5805877114870883, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.13545608520508, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8573429584503174, "num_tokens": 774016696.0, "step": 20286 }, { "epoch": 2.5807149217656784, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.3311767578125, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8560651540756226, "num_tokens": 774059046.0, "step": 20287 }, { "epoch": 2.5808421320442694, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.67054748535156, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8595309257507324, "num_tokens": 774095322.0, "step": 20288 }, { "epoch": 2.5809693423228595, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.80374526977539, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.8524336814880371, "num_tokens": 774134506.0, "step": 20289 }, { "epoch": 2.5810965526014504, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.72153091430664, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8793813586235046, "num_tokens": 774176997.0, "step": 20290 }, { "epoch": 2.5812237628800405, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.9063606262207, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8567946553230286, "num_tokens": 774213900.0, "step": 20291 }, { "epoch": 2.5813509731586315, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.62827682495117, "learning_rate": 1e-06, "loss": 0.6308, "mean_token_accuracy": 0.8561124801635742, "num_tokens": 774259687.0, "step": 20292 }, { "epoch": 2.5814781834372216, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.31068801879883, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8686426877975464, "num_tokens": 774297200.0, "step": 20293 }, { "epoch": 2.581605393715812, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.83293914794922, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8768135905265808, "num_tokens": 774336642.0, "step": 20294 }, { "epoch": 2.5817326039944026, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.23402404785156, "learning_rate": 1e-06, "loss": 0.6547, "mean_token_accuracy": 0.8478104472160339, "num_tokens": 774376762.0, "step": 20295 }, { "epoch": 2.581859814272993, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.824073791503906, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.872390627861023, "num_tokens": 774421360.0, "step": 20296 }, { "epoch": 2.5819870245515837, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.0131721496582, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8706469535827637, "num_tokens": 774460850.0, "step": 20297 }, { "epoch": 2.582114234830174, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.981327056884766, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.861722469329834, "num_tokens": 774497542.0, "step": 20298 }, { "epoch": 2.5822414451087647, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.64763641357422, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.8533409833908081, "num_tokens": 774538164.0, "step": 20299 }, { "epoch": 2.5823686553873553, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.51081848144531, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8590516448020935, "num_tokens": 774576118.0, "step": 20300 }, { "epoch": 2.582495865665946, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.04896926879883, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8859708309173584, "num_tokens": 774617506.0, "step": 20301 }, { "epoch": 2.5826230759445363, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.315345764160156, "learning_rate": 1e-06, "loss": 0.6936, "mean_token_accuracy": 0.8389937877655029, "num_tokens": 774653720.0, "step": 20302 }, { "epoch": 2.582750286223127, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.08768844604492, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8848079442977905, "num_tokens": 774691056.0, "step": 20303 }, { "epoch": 2.5828774965017174, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.59796905517578, "learning_rate": 1e-06, "loss": 0.6308, "mean_token_accuracy": 0.856275737285614, "num_tokens": 774726514.0, "step": 20304 }, { "epoch": 2.583004706780308, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.533485412597656, "learning_rate": 1e-06, "loss": 0.6232, "mean_token_accuracy": 0.854120135307312, "num_tokens": 774760831.0, "step": 20305 }, { "epoch": 2.5831319170588984, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.00216293334961, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8748550415039062, "num_tokens": 774799399.0, "step": 20306 }, { "epoch": 2.583259127337489, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.94314193725586, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8552700281143188, "num_tokens": 774839910.0, "step": 20307 }, { "epoch": 2.5833863376160795, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.41782760620117, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8649500608444214, "num_tokens": 774875694.0, "step": 20308 }, { "epoch": 2.58351354789467, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.828521728515625, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8668460249900818, "num_tokens": 774908020.0, "step": 20309 }, { "epoch": 2.5836407581732606, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.04506301879883, "learning_rate": 1e-06, "loss": 0.6358, "mean_token_accuracy": 0.8543623685836792, "num_tokens": 774946716.0, "step": 20310 }, { "epoch": 2.583767968451851, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.94112014770508, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8575790524482727, "num_tokens": 774981732.0, "step": 20311 }, { "epoch": 2.583895178730441, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.98869323730469, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8667024374008179, "num_tokens": 775015280.0, "step": 20312 }, { "epoch": 2.584022389009032, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.774986267089844, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8706052303314209, "num_tokens": 775054518.0, "step": 20313 }, { "epoch": 2.5841495992876222, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.16890335083008, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8713880777359009, "num_tokens": 775091548.0, "step": 20314 }, { "epoch": 2.584276809566213, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.92197036743164, "learning_rate": 1e-06, "loss": 0.6427, "mean_token_accuracy": 0.8519546389579773, "num_tokens": 775130916.0, "step": 20315 }, { "epoch": 2.5844040198448033, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.61383819580078, "learning_rate": 1e-06, "loss": 0.5759, "mean_token_accuracy": 0.8761618137359619, "num_tokens": 775165875.0, "step": 20316 }, { "epoch": 2.584531230123394, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.051090240478516, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8501919507980347, "num_tokens": 775211463.0, "step": 20317 }, { "epoch": 2.5846584404019843, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.69424819946289, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8736377954483032, "num_tokens": 775248338.0, "step": 20318 }, { "epoch": 2.584785650680575, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.774940490722656, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8515275120735168, "num_tokens": 775285707.0, "step": 20319 }, { "epoch": 2.5849128609591654, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.01929473876953, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8709743618965149, "num_tokens": 775322682.0, "step": 20320 }, { "epoch": 2.585040071237756, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.44050216674805, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8738514184951782, "num_tokens": 775358274.0, "step": 20321 }, { "epoch": 2.5851672815163464, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.19490051269531, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8581675291061401, "num_tokens": 775394472.0, "step": 20322 }, { "epoch": 2.585294491794937, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.84614944458008, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8733817338943481, "num_tokens": 775433647.0, "step": 20323 }, { "epoch": 2.5854217020735275, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.38782501220703, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.8528504371643066, "num_tokens": 775469159.0, "step": 20324 }, { "epoch": 2.585548912352118, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.89534378051758, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8721457719802856, "num_tokens": 775507329.0, "step": 20325 }, { "epoch": 2.5856761226307086, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.299217224121094, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8616631031036377, "num_tokens": 775544638.0, "step": 20326 }, { "epoch": 2.585803332909299, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.82208251953125, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8646087050437927, "num_tokens": 775584918.0, "step": 20327 }, { "epoch": 2.5859305431878896, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.0037841796875, "learning_rate": 1e-06, "loss": 0.6287, "mean_token_accuracy": 0.8590065240859985, "num_tokens": 775624214.0, "step": 20328 }, { "epoch": 2.58605775346648, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.76446533203125, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.8510024547576904, "num_tokens": 775668372.0, "step": 20329 }, { "epoch": 2.5861849637450707, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.869117736816406, "learning_rate": 1e-06, "loss": 0.6811, "mean_token_accuracy": 0.8357656002044678, "num_tokens": 775700578.0, "step": 20330 }, { "epoch": 2.586312174023661, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.99030685424805, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8708864450454712, "num_tokens": 775746980.0, "step": 20331 }, { "epoch": 2.5864393843022517, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.35529327392578, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8569468259811401, "num_tokens": 775780762.0, "step": 20332 }, { "epoch": 2.5865665945808423, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.901546478271484, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8684024810791016, "num_tokens": 775815591.0, "step": 20333 }, { "epoch": 2.586693804859433, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.3482666015625, "learning_rate": 1e-06, "loss": 0.6431, "mean_token_accuracy": 0.8547670841217041, "num_tokens": 775856414.0, "step": 20334 }, { "epoch": 2.5868210151380233, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.497379302978516, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8648481965065002, "num_tokens": 775896939.0, "step": 20335 }, { "epoch": 2.586948225416614, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.019378662109375, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8576223850250244, "num_tokens": 775941778.0, "step": 20336 }, { "epoch": 2.587075435695204, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.74976348876953, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8629809617996216, "num_tokens": 775979346.0, "step": 20337 }, { "epoch": 2.587202645973795, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.648902893066406, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.860636830329895, "num_tokens": 776013793.0, "step": 20338 }, { "epoch": 2.587329856252385, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.575923919677734, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8768801093101501, "num_tokens": 776058513.0, "step": 20339 }, { "epoch": 2.587457066530976, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.49534225463867, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.863969087600708, "num_tokens": 776102790.0, "step": 20340 }, { "epoch": 2.587584276809566, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.597686767578125, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8775020837783813, "num_tokens": 776133898.0, "step": 20341 }, { "epoch": 2.5877114870881566, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.63105392456055, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.8624447584152222, "num_tokens": 776167590.0, "step": 20342 }, { "epoch": 2.587838697366747, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.57434844970703, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8563448786735535, "num_tokens": 776207966.0, "step": 20343 }, { "epoch": 2.5879659076453376, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.62322235107422, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8634613752365112, "num_tokens": 776245601.0, "step": 20344 }, { "epoch": 2.588093117923928, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.61124038696289, "learning_rate": 1e-06, "loss": 0.5786, "mean_token_accuracy": 0.8723434805870056, "num_tokens": 776285377.0, "step": 20345 }, { "epoch": 2.5882203282025187, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.20849609375, "learning_rate": 1e-06, "loss": 0.6433, "mean_token_accuracy": 0.8540179133415222, "num_tokens": 776328116.0, "step": 20346 }, { "epoch": 2.588347538481109, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.031028747558594, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8477987051010132, "num_tokens": 776369008.0, "step": 20347 }, { "epoch": 2.5884747487596997, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.324012756347656, "learning_rate": 1e-06, "loss": 0.6427, "mean_token_accuracy": 0.8535236716270447, "num_tokens": 776410151.0, "step": 20348 }, { "epoch": 2.5886019590382903, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.48615646362305, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8872109651565552, "num_tokens": 776445944.0, "step": 20349 }, { "epoch": 2.588729169316881, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.962158203125, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8656731843948364, "num_tokens": 776485388.0, "step": 20350 }, { "epoch": 2.5888563795954713, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.502410888671875, "learning_rate": 1e-06, "loss": 0.6651, "mean_token_accuracy": 0.8438997864723206, "num_tokens": 776529135.0, "step": 20351 }, { "epoch": 2.588983589874062, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.841796875, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8559876680374146, "num_tokens": 776564310.0, "step": 20352 }, { "epoch": 2.5891108001526524, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.840755462646484, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8769170045852661, "num_tokens": 776609004.0, "step": 20353 }, { "epoch": 2.589238010431243, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.864322662353516, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.8699336647987366, "num_tokens": 776645796.0, "step": 20354 }, { "epoch": 2.5893652207098334, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.72864532470703, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.8539979457855225, "num_tokens": 776684248.0, "step": 20355 }, { "epoch": 2.589492430988424, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.82538604736328, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8664299249649048, "num_tokens": 776722684.0, "step": 20356 }, { "epoch": 2.5896196412670145, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.4021110534668, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8686678409576416, "num_tokens": 776764635.0, "step": 20357 }, { "epoch": 2.589746851545605, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.39580535888672, "learning_rate": 1e-06, "loss": 0.6472, "mean_token_accuracy": 0.8514593839645386, "num_tokens": 776805683.0, "step": 20358 }, { "epoch": 2.5898740618241956, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.4383430480957, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8654391765594482, "num_tokens": 776841229.0, "step": 20359 }, { "epoch": 2.5900012721027856, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.971981048583984, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8705208897590637, "num_tokens": 776881319.0, "step": 20360 }, { "epoch": 2.5901284823813766, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.435516357421875, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8690241575241089, "num_tokens": 776919436.0, "step": 20361 }, { "epoch": 2.5902556926599667, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.054298400878906, "learning_rate": 1e-06, "loss": 0.6759, "mean_token_accuracy": 0.8455703854560852, "num_tokens": 776958644.0, "step": 20362 }, { "epoch": 2.5903829029385577, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.04047775268555, "learning_rate": 1e-06, "loss": 0.5557, "mean_token_accuracy": 0.8801849484443665, "num_tokens": 776994509.0, "step": 20363 }, { "epoch": 2.5905101132171477, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 50.99506378173828, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8694511651992798, "num_tokens": 777037557.0, "step": 20364 }, { "epoch": 2.5906373234957387, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.2347412109375, "learning_rate": 1e-06, "loss": 0.6788, "mean_token_accuracy": 0.8437399864196777, "num_tokens": 777077237.0, "step": 20365 }, { "epoch": 2.590764533774329, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 50.91768264770508, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8696222305297852, "num_tokens": 777120813.0, "step": 20366 }, { "epoch": 2.5908917440529193, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.019020080566406, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8501232266426086, "num_tokens": 777155809.0, "step": 20367 }, { "epoch": 2.59101895433151, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.54368591308594, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8626982569694519, "num_tokens": 777195868.0, "step": 20368 }, { "epoch": 2.5911461646101004, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.94837951660156, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8667822480201721, "num_tokens": 777233406.0, "step": 20369 }, { "epoch": 2.591273374888691, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.94234848022461, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.864625096321106, "num_tokens": 777272817.0, "step": 20370 }, { "epoch": 2.5914005851672814, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.67440414428711, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8588036894798279, "num_tokens": 777312398.0, "step": 20371 }, { "epoch": 2.591527795445872, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.37085723876953, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.872999906539917, "num_tokens": 777349375.0, "step": 20372 }, { "epoch": 2.5916550057244625, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.771018981933594, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8719427585601807, "num_tokens": 777394806.0, "step": 20373 }, { "epoch": 2.591782216003053, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.90584182739258, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8684743642807007, "num_tokens": 777434350.0, "step": 20374 }, { "epoch": 2.5919094262816436, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.6840934753418, "learning_rate": 1e-06, "loss": 0.6801, "mean_token_accuracy": 0.836133599281311, "num_tokens": 777469690.0, "step": 20375 }, { "epoch": 2.592036636560234, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.23609924316406, "learning_rate": 1e-06, "loss": 0.6388, "mean_token_accuracy": 0.8558731079101562, "num_tokens": 777502095.0, "step": 20376 }, { "epoch": 2.5921638468388246, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.01399612426758, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8739864826202393, "num_tokens": 777540387.0, "step": 20377 }, { "epoch": 2.592291057117415, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.97694396972656, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.8700556755065918, "num_tokens": 777581855.0, "step": 20378 }, { "epoch": 2.5924182673960057, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.8046875, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8626079559326172, "num_tokens": 777619852.0, "step": 20379 }, { "epoch": 2.592545477674596, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.13001251220703, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8569995164871216, "num_tokens": 777662772.0, "step": 20380 }, { "epoch": 2.5926726879531867, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.9543571472168, "learning_rate": 1e-06, "loss": 0.6568, "mean_token_accuracy": 0.8476718664169312, "num_tokens": 777701171.0, "step": 20381 }, { "epoch": 2.5927998982317773, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.62700271606445, "learning_rate": 1e-06, "loss": 0.6605, "mean_token_accuracy": 0.8477867841720581, "num_tokens": 777738245.0, "step": 20382 }, { "epoch": 2.592927108510368, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.868221282958984, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8641500473022461, "num_tokens": 777775732.0, "step": 20383 }, { "epoch": 2.5930543187889583, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.805091857910156, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8623406887054443, "num_tokens": 777811077.0, "step": 20384 }, { "epoch": 2.5931815290675484, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.394386291503906, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8618110418319702, "num_tokens": 777854107.0, "step": 20385 }, { "epoch": 2.5933087393461394, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.26518630981445, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.8623573780059814, "num_tokens": 777891950.0, "step": 20386 }, { "epoch": 2.5934359496247295, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.1417350769043, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8803248405456543, "num_tokens": 777932638.0, "step": 20387 }, { "epoch": 2.5935631599033204, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.4890251159668, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.8613376021385193, "num_tokens": 777977733.0, "step": 20388 }, { "epoch": 2.5936903701819105, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.112369537353516, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8684686422348022, "num_tokens": 778020451.0, "step": 20389 }, { "epoch": 2.5938175804605015, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.69105911254883, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8686015605926514, "num_tokens": 778059594.0, "step": 20390 }, { "epoch": 2.5939447907390916, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.73048400878906, "learning_rate": 1e-06, "loss": 0.5898, "mean_token_accuracy": 0.8672760725021362, "num_tokens": 778099235.0, "step": 20391 }, { "epoch": 2.594072001017682, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.82625198364258, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8744102716445923, "num_tokens": 778135430.0, "step": 20392 }, { "epoch": 2.5941992112962726, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.15800857543945, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8560454845428467, "num_tokens": 778175111.0, "step": 20393 }, { "epoch": 2.594326421574863, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.55802536010742, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8542153835296631, "num_tokens": 778211954.0, "step": 20394 }, { "epoch": 2.5944536318534537, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.30696487426758, "learning_rate": 1e-06, "loss": 0.6596, "mean_token_accuracy": 0.8471481800079346, "num_tokens": 778246580.0, "step": 20395 }, { "epoch": 2.594580842132044, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.411197662353516, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8681610226631165, "num_tokens": 778288659.0, "step": 20396 }, { "epoch": 2.5947080524106347, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.81838607788086, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8622177243232727, "num_tokens": 778325365.0, "step": 20397 }, { "epoch": 2.5948352626892253, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.115333557128906, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8709032535552979, "num_tokens": 778362982.0, "step": 20398 }, { "epoch": 2.594962472967816, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.069026947021484, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.8475803732872009, "num_tokens": 778394230.0, "step": 20399 }, { "epoch": 2.5950896832464063, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.523597717285156, "learning_rate": 1e-06, "loss": 0.6517, "mean_token_accuracy": 0.8513692021369934, "num_tokens": 778434345.0, "step": 20400 }, { "epoch": 2.595216893524997, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.34270095825195, "learning_rate": 1e-06, "loss": 0.6363, "mean_token_accuracy": 0.8582725524902344, "num_tokens": 778478303.0, "step": 20401 }, { "epoch": 2.5953441038035874, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.02999496459961, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8701797127723694, "num_tokens": 778513915.0, "step": 20402 }, { "epoch": 2.595471314082178, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.399925231933594, "learning_rate": 1e-06, "loss": 0.5571, "mean_token_accuracy": 0.8781201243400574, "num_tokens": 778554294.0, "step": 20403 }, { "epoch": 2.5955985243607684, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.65187454223633, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8612125515937805, "num_tokens": 778592836.0, "step": 20404 }, { "epoch": 2.595725734639359, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.508296966552734, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8547290563583374, "num_tokens": 778629316.0, "step": 20405 }, { "epoch": 2.5958529449179495, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.3109245300293, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8576864004135132, "num_tokens": 778659065.0, "step": 20406 }, { "epoch": 2.59598015519654, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.289363861083984, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8794323205947876, "num_tokens": 778693949.0, "step": 20407 }, { "epoch": 2.5961073654751305, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.63874053955078, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8689181804656982, "num_tokens": 778738045.0, "step": 20408 }, { "epoch": 2.596234575753721, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.182743072509766, "learning_rate": 1e-06, "loss": 0.742, "mean_token_accuracy": 0.8180596232414246, "num_tokens": 778769578.0, "step": 20409 }, { "epoch": 2.596361786032311, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.744720458984375, "learning_rate": 1e-06, "loss": 0.6338, "mean_token_accuracy": 0.8587979078292847, "num_tokens": 778810052.0, "step": 20410 }, { "epoch": 2.596488996310902, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.99003982543945, "learning_rate": 1e-06, "loss": 0.6342, "mean_token_accuracy": 0.8566619157791138, "num_tokens": 778850868.0, "step": 20411 }, { "epoch": 2.596616206589492, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.8969612121582, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.871350884437561, "num_tokens": 778890129.0, "step": 20412 }, { "epoch": 2.596743416868083, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.85641860961914, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8691039681434631, "num_tokens": 778923181.0, "step": 20413 }, { "epoch": 2.5968706271466733, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.96073532104492, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8556578159332275, "num_tokens": 778963328.0, "step": 20414 }, { "epoch": 2.596997837425264, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.77293014526367, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.8733357191085815, "num_tokens": 779003932.0, "step": 20415 }, { "epoch": 2.5971250477038543, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.14537811279297, "learning_rate": 1e-06, "loss": 0.6432, "mean_token_accuracy": 0.8532501459121704, "num_tokens": 779043913.0, "step": 20416 }, { "epoch": 2.597252257982445, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.796485900878906, "learning_rate": 1e-06, "loss": 0.6214, "mean_token_accuracy": 0.8594455718994141, "num_tokens": 779077126.0, "step": 20417 }, { "epoch": 2.5973794682610354, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.04273223876953, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.866263747215271, "num_tokens": 779111544.0, "step": 20418 }, { "epoch": 2.597506678539626, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.2178955078125, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.8623039126396179, "num_tokens": 779145493.0, "step": 20419 }, { "epoch": 2.5976338888182164, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.6274299621582, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8552023768424988, "num_tokens": 779189216.0, "step": 20420 }, { "epoch": 2.597761099096807, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.11296463012695, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8698543906211853, "num_tokens": 779225914.0, "step": 20421 }, { "epoch": 2.5978883093753975, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.255489349365234, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8679417371749878, "num_tokens": 779264118.0, "step": 20422 }, { "epoch": 2.598015519653988, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.157623291015625, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8665438890457153, "num_tokens": 779301694.0, "step": 20423 }, { "epoch": 2.5981427299325786, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.6733512878418, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8610087633132935, "num_tokens": 779339012.0, "step": 20424 }, { "epoch": 2.598269940211169, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.186256408691406, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8675602078437805, "num_tokens": 779376358.0, "step": 20425 }, { "epoch": 2.5983971504897596, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.6314697265625, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8614392280578613, "num_tokens": 779416344.0, "step": 20426 }, { "epoch": 2.59852436076835, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.29607009887695, "learning_rate": 1e-06, "loss": 0.6549, "mean_token_accuracy": 0.850852906703949, "num_tokens": 779452189.0, "step": 20427 }, { "epoch": 2.5986515710469407, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.47489929199219, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8630725741386414, "num_tokens": 779493513.0, "step": 20428 }, { "epoch": 2.598778781325531, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 50.921451568603516, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8685147762298584, "num_tokens": 779533061.0, "step": 20429 }, { "epoch": 2.5989059916041217, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.651214599609375, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8564997911453247, "num_tokens": 779570159.0, "step": 20430 }, { "epoch": 2.5990332018827123, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.450164794921875, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8611010313034058, "num_tokens": 779609066.0, "step": 20431 }, { "epoch": 2.599160412161303, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.340518951416016, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.862360417842865, "num_tokens": 779651087.0, "step": 20432 }, { "epoch": 2.5992876224398933, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.440364837646484, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.8757826685905457, "num_tokens": 779689531.0, "step": 20433 }, { "epoch": 2.599414832718484, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.44841003417969, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8597815036773682, "num_tokens": 779724062.0, "step": 20434 }, { "epoch": 2.599542042997074, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.33487319946289, "learning_rate": 1e-06, "loss": 0.6588, "mean_token_accuracy": 0.8494552373886108, "num_tokens": 779766029.0, "step": 20435 }, { "epoch": 2.599669253275665, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.31974792480469, "learning_rate": 1e-06, "loss": 0.6582, "mean_token_accuracy": 0.8470544815063477, "num_tokens": 779811065.0, "step": 20436 }, { "epoch": 2.599796463554255, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.84067916870117, "learning_rate": 1e-06, "loss": 0.6603, "mean_token_accuracy": 0.8485496640205383, "num_tokens": 779852442.0, "step": 20437 }, { "epoch": 2.599923673832846, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.0952033996582, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.868681788444519, "num_tokens": 779885993.0, "step": 20438 }, { "epoch": 2.600050884111436, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.29513931274414, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8670910596847534, "num_tokens": 779927791.0, "step": 20439 }, { "epoch": 2.6001780943900266, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.47700500488281, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8664920926094055, "num_tokens": 779961517.0, "step": 20440 }, { "epoch": 2.600305304668617, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.34078598022461, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8500211238861084, "num_tokens": 779992902.0, "step": 20441 }, { "epoch": 2.6004325149472076, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.25691604614258, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8633179068565369, "num_tokens": 780032849.0, "step": 20442 }, { "epoch": 2.600559725225798, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.06660079956055, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8743231892585754, "num_tokens": 780072563.0, "step": 20443 }, { "epoch": 2.6006869355043887, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.573177337646484, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8635674715042114, "num_tokens": 780115299.0, "step": 20444 }, { "epoch": 2.600814145782979, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.482154846191406, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8605363965034485, "num_tokens": 780155792.0, "step": 20445 }, { "epoch": 2.6009413560615697, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.43059539794922, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8581452369689941, "num_tokens": 780189040.0, "step": 20446 }, { "epoch": 2.6010685663401603, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.66086959838867, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8626581430435181, "num_tokens": 780229463.0, "step": 20447 }, { "epoch": 2.601195776618751, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.2948112487793, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8728065490722656, "num_tokens": 780265961.0, "step": 20448 }, { "epoch": 2.6013229868973413, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.86240005493164, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8570391535758972, "num_tokens": 780303220.0, "step": 20449 }, { "epoch": 2.601450197175932, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.421119689941406, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8606681823730469, "num_tokens": 780340456.0, "step": 20450 }, { "epoch": 2.6015774074545224, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.77710723876953, "learning_rate": 1e-06, "loss": 0.6363, "mean_token_accuracy": 0.8606643676757812, "num_tokens": 780381402.0, "step": 20451 }, { "epoch": 2.601704617733113, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.461265563964844, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8669040203094482, "num_tokens": 780422571.0, "step": 20452 }, { "epoch": 2.6018318280117034, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.820125579833984, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8585205078125, "num_tokens": 780462447.0, "step": 20453 }, { "epoch": 2.601959038290294, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.21874237060547, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8631621599197388, "num_tokens": 780499183.0, "step": 20454 }, { "epoch": 2.6020862485688845, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.9153938293457, "learning_rate": 1e-06, "loss": 0.6539, "mean_token_accuracy": 0.8487681150436401, "num_tokens": 780539216.0, "step": 20455 }, { "epoch": 2.602213458847475, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.04629135131836, "learning_rate": 1e-06, "loss": 0.5918, "mean_token_accuracy": 0.869397759437561, "num_tokens": 780578725.0, "step": 20456 }, { "epoch": 2.6023406691260655, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.16463088989258, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8634015321731567, "num_tokens": 780615175.0, "step": 20457 }, { "epoch": 2.6024678794046556, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.57883834838867, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.865278422832489, "num_tokens": 780651186.0, "step": 20458 }, { "epoch": 2.6025950896832466, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.64762878417969, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8550270199775696, "num_tokens": 780685550.0, "step": 20459 }, { "epoch": 2.6027222999618367, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.4178466796875, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8857783079147339, "num_tokens": 780725477.0, "step": 20460 }, { "epoch": 2.6028495102404277, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.47041320800781, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8665132522583008, "num_tokens": 780762481.0, "step": 20461 }, { "epoch": 2.6029767205190177, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.35683059692383, "learning_rate": 1e-06, "loss": 0.6773, "mean_token_accuracy": 0.8432703018188477, "num_tokens": 780800305.0, "step": 20462 }, { "epoch": 2.6031039307976087, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.732208251953125, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8649829626083374, "num_tokens": 780836271.0, "step": 20463 }, { "epoch": 2.603231141076199, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.10684585571289, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8754265308380127, "num_tokens": 780877363.0, "step": 20464 }, { "epoch": 2.6033583513547893, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 50.50010299682617, "learning_rate": 1e-06, "loss": 0.6612, "mean_token_accuracy": 0.8503757119178772, "num_tokens": 780922982.0, "step": 20465 }, { "epoch": 2.60348556163338, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.90935134887695, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8605211973190308, "num_tokens": 780957925.0, "step": 20466 }, { "epoch": 2.6036127719119704, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.119178771972656, "learning_rate": 1e-06, "loss": 0.6857, "mean_token_accuracy": 0.8432719707489014, "num_tokens": 780996922.0, "step": 20467 }, { "epoch": 2.603739982190561, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.119197845458984, "learning_rate": 1e-06, "loss": 0.6885, "mean_token_accuracy": 0.8445156216621399, "num_tokens": 781037209.0, "step": 20468 }, { "epoch": 2.6038671924691514, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.089725494384766, "learning_rate": 1e-06, "loss": 0.6486, "mean_token_accuracy": 0.8558934926986694, "num_tokens": 781078077.0, "step": 20469 }, { "epoch": 2.603994402747742, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.26174545288086, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8604291677474976, "num_tokens": 781122516.0, "step": 20470 }, { "epoch": 2.6041216130263325, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.92315673828125, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8543633818626404, "num_tokens": 781156290.0, "step": 20471 }, { "epoch": 2.604248823304923, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 50.9344367980957, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8701227903366089, "num_tokens": 781192769.0, "step": 20472 }, { "epoch": 2.6043760335835135, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.72406005859375, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8682108521461487, "num_tokens": 781227594.0, "step": 20473 }, { "epoch": 2.604503243862104, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.3359375, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8723539113998413, "num_tokens": 781259369.0, "step": 20474 }, { "epoch": 2.6046304541406946, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.65846633911133, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8611844778060913, "num_tokens": 781296788.0, "step": 20475 }, { "epoch": 2.604757664419285, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.914608001708984, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8643596172332764, "num_tokens": 781333103.0, "step": 20476 }, { "epoch": 2.6048848746978757, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.18769836425781, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8717223405838013, "num_tokens": 781372691.0, "step": 20477 }, { "epoch": 2.605012084976466, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.218910217285156, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8573850393295288, "num_tokens": 781410480.0, "step": 20478 }, { "epoch": 2.6051392952550567, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.9898796081543, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8546639680862427, "num_tokens": 781454110.0, "step": 20479 }, { "epoch": 2.6052665055336472, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.220550537109375, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.860476016998291, "num_tokens": 781494034.0, "step": 20480 }, { "epoch": 2.6053937158122378, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.12928009033203, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8603920936584473, "num_tokens": 781536225.0, "step": 20481 }, { "epoch": 2.6055209260908283, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 50.837440490722656, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8681395053863525, "num_tokens": 781573987.0, "step": 20482 }, { "epoch": 2.6056481363694184, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.50316619873047, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.866234302520752, "num_tokens": 781608987.0, "step": 20483 }, { "epoch": 2.6057753466480094, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.472660064697266, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8681777715682983, "num_tokens": 781646185.0, "step": 20484 }, { "epoch": 2.6059025569265994, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.95887756347656, "learning_rate": 1e-06, "loss": 0.6481, "mean_token_accuracy": 0.8513973951339722, "num_tokens": 781682999.0, "step": 20485 }, { "epoch": 2.6060297672051904, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.73784255981445, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.8596599698066711, "num_tokens": 781726052.0, "step": 20486 }, { "epoch": 2.6061569774837805, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.84854507446289, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8768206834793091, "num_tokens": 781760562.0, "step": 20487 }, { "epoch": 2.6062841877623715, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.833011627197266, "learning_rate": 1e-06, "loss": 0.6459, "mean_token_accuracy": 0.8505198955535889, "num_tokens": 781803765.0, "step": 20488 }, { "epoch": 2.6064113980409616, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.59731674194336, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8617041707038879, "num_tokens": 781845419.0, "step": 20489 }, { "epoch": 2.606538608319552, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.656429290771484, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8710191249847412, "num_tokens": 781883371.0, "step": 20490 }, { "epoch": 2.6066658185981426, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.50605392456055, "learning_rate": 1e-06, "loss": 0.6196, "mean_token_accuracy": 0.8613593578338623, "num_tokens": 781919669.0, "step": 20491 }, { "epoch": 2.606793028876733, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.03109359741211, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8691865801811218, "num_tokens": 781953212.0, "step": 20492 }, { "epoch": 2.6069202391553237, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 49.8130989074707, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8663091659545898, "num_tokens": 781996824.0, "step": 20493 }, { "epoch": 2.607047449433914, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.36574172973633, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8816535472869873, "num_tokens": 782035350.0, "step": 20494 }, { "epoch": 2.6071746597125047, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 49.423160552978516, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8561320304870605, "num_tokens": 782073217.0, "step": 20495 }, { "epoch": 2.6073018699910953, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.29252624511719, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.8556269407272339, "num_tokens": 782110777.0, "step": 20496 }, { "epoch": 2.607429080269686, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 49.8204231262207, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8558719158172607, "num_tokens": 782153300.0, "step": 20497 }, { "epoch": 2.6075562905482763, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 50.903743743896484, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8682193160057068, "num_tokens": 782191945.0, "step": 20498 }, { "epoch": 2.607683500826867, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 49.83343505859375, "learning_rate": 1e-06, "loss": 0.579, "mean_token_accuracy": 0.8725631833076477, "num_tokens": 782230472.0, "step": 20499 }, { "epoch": 2.6078107111054574, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.48306655883789, "learning_rate": 1e-06, "loss": 0.6423, "mean_token_accuracy": 0.8521797060966492, "num_tokens": 782263868.0, "step": 20500 }, { "epoch": 2.607937921384048, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.29884719848633, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8651507496833801, "num_tokens": 782299781.0, "step": 20501 }, { "epoch": 2.6080651316626384, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 50.59452819824219, "learning_rate": 1e-06, "loss": 0.6452, "mean_token_accuracy": 0.8558651804924011, "num_tokens": 782343376.0, "step": 20502 }, { "epoch": 2.608192341941229, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.82394027709961, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8749734163284302, "num_tokens": 782389894.0, "step": 20503 }, { "epoch": 2.6083195522198195, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.58700180053711, "learning_rate": 1e-06, "loss": 0.5989, "mean_token_accuracy": 0.8652687072753906, "num_tokens": 782430055.0, "step": 20504 }, { "epoch": 2.60844676249841, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.58992385864258, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8632696866989136, "num_tokens": 782467211.0, "step": 20505 }, { "epoch": 2.6085739727770005, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.72797775268555, "learning_rate": 1e-06, "loss": 0.6436, "mean_token_accuracy": 0.8553646206855774, "num_tokens": 782505913.0, "step": 20506 }, { "epoch": 2.608701183055591, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.35000228881836, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8626528978347778, "num_tokens": 782541858.0, "step": 20507 }, { "epoch": 2.608828393334181, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.089962005615234, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8627501726150513, "num_tokens": 782575854.0, "step": 20508 }, { "epoch": 2.608955603612772, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.07058334350586, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8649146556854248, "num_tokens": 782614005.0, "step": 20509 }, { "epoch": 2.609082813891362, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.14497756958008, "learning_rate": 1e-06, "loss": 0.5995, "mean_token_accuracy": 0.8708117008209229, "num_tokens": 782650147.0, "step": 20510 }, { "epoch": 2.609210024169953, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.411041259765625, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8635362386703491, "num_tokens": 782683215.0, "step": 20511 }, { "epoch": 2.6093372344485433, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.72346878051758, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8681429624557495, "num_tokens": 782721184.0, "step": 20512 }, { "epoch": 2.609464444727134, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.024837493896484, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8572392463684082, "num_tokens": 782759891.0, "step": 20513 }, { "epoch": 2.6095916550057243, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.951805114746094, "learning_rate": 1e-06, "loss": 0.6549, "mean_token_accuracy": 0.852062463760376, "num_tokens": 782796398.0, "step": 20514 }, { "epoch": 2.609718865284315, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.35059356689453, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8723282814025879, "num_tokens": 782835968.0, "step": 20515 }, { "epoch": 2.6098460755629054, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.198760986328125, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8743636012077332, "num_tokens": 782871367.0, "step": 20516 }, { "epoch": 2.609973285841496, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.672813415527344, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8849866986274719, "num_tokens": 782905766.0, "step": 20517 }, { "epoch": 2.6101004961200864, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.985923767089844, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8686401844024658, "num_tokens": 782942256.0, "step": 20518 }, { "epoch": 2.610227706398677, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.37551498413086, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8743524551391602, "num_tokens": 782976156.0, "step": 20519 }, { "epoch": 2.6103549166772675, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.968101501464844, "learning_rate": 1e-06, "loss": 0.585, "mean_token_accuracy": 0.8723409175872803, "num_tokens": 783011450.0, "step": 20520 }, { "epoch": 2.610482126955858, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.45656967163086, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8673496246337891, "num_tokens": 783050355.0, "step": 20521 }, { "epoch": 2.6106093372344485, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.95196533203125, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8723162412643433, "num_tokens": 783091621.0, "step": 20522 }, { "epoch": 2.610736547513039, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.24403762817383, "learning_rate": 1e-06, "loss": 0.6536, "mean_token_accuracy": 0.8531700372695923, "num_tokens": 783130471.0, "step": 20523 }, { "epoch": 2.6108637577916296, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.38818359375, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8663580417633057, "num_tokens": 783168340.0, "step": 20524 }, { "epoch": 2.61099096807022, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.491432189941406, "learning_rate": 1e-06, "loss": 0.6653, "mean_token_accuracy": 0.844349205493927, "num_tokens": 783207279.0, "step": 20525 }, { "epoch": 2.6111181783488107, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.02329635620117, "learning_rate": 1e-06, "loss": 0.6348, "mean_token_accuracy": 0.8554720878601074, "num_tokens": 783243010.0, "step": 20526 }, { "epoch": 2.611245388627401, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.850154876708984, "learning_rate": 1e-06, "loss": 0.6415, "mean_token_accuracy": 0.8526798486709595, "num_tokens": 783287548.0, "step": 20527 }, { "epoch": 2.6113725989059917, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.28189468383789, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8602434396743774, "num_tokens": 783331549.0, "step": 20528 }, { "epoch": 2.6114998091845822, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.96259689331055, "learning_rate": 1e-06, "loss": 0.5843, "mean_token_accuracy": 0.8720259666442871, "num_tokens": 783369445.0, "step": 20529 }, { "epoch": 2.6116270194631728, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.16875457763672, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8587208986282349, "num_tokens": 783413218.0, "step": 20530 }, { "epoch": 2.6117542297417633, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.84218215942383, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8611078262329102, "num_tokens": 783449944.0, "step": 20531 }, { "epoch": 2.611881440020354, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.19002151489258, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8632423877716064, "num_tokens": 783486142.0, "step": 20532 }, { "epoch": 2.612008650298944, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.061622619628906, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8695011138916016, "num_tokens": 783521170.0, "step": 20533 }, { "epoch": 2.612135860577535, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.80509567260742, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8610057830810547, "num_tokens": 783563902.0, "step": 20534 }, { "epoch": 2.612263070856125, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.0252571105957, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.869331955909729, "num_tokens": 783597655.0, "step": 20535 }, { "epoch": 2.612390281134716, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.038455963134766, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8518736362457275, "num_tokens": 783635233.0, "step": 20536 }, { "epoch": 2.612517491413306, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.07453918457031, "learning_rate": 1e-06, "loss": 0.5982, "mean_token_accuracy": 0.8641985654830933, "num_tokens": 783679677.0, "step": 20537 }, { "epoch": 2.6126447016918966, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.14342498779297, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8688129186630249, "num_tokens": 783720608.0, "step": 20538 }, { "epoch": 2.612771911970487, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.62100601196289, "learning_rate": 1e-06, "loss": 0.5768, "mean_token_accuracy": 0.8729796409606934, "num_tokens": 783753198.0, "step": 20539 }, { "epoch": 2.6128991222490776, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.146331787109375, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8623003959655762, "num_tokens": 783791068.0, "step": 20540 }, { "epoch": 2.613026332527668, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.53826904296875, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8641250729560852, "num_tokens": 783831319.0, "step": 20541 }, { "epoch": 2.6131535428062587, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.04107666015625, "learning_rate": 1e-06, "loss": 0.5872, "mean_token_accuracy": 0.8669722080230713, "num_tokens": 783867537.0, "step": 20542 }, { "epoch": 2.613280753084849, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.9310417175293, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8571468591690063, "num_tokens": 783907787.0, "step": 20543 }, { "epoch": 2.6134079633634397, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.80366134643555, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8561787605285645, "num_tokens": 783948558.0, "step": 20544 }, { "epoch": 2.6135351736420303, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.04155731201172, "learning_rate": 1e-06, "loss": 0.5517, "mean_token_accuracy": 0.8819996118545532, "num_tokens": 783986726.0, "step": 20545 }, { "epoch": 2.613662383920621, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.83901596069336, "learning_rate": 1e-06, "loss": 0.6517, "mean_token_accuracy": 0.8439382314682007, "num_tokens": 784023449.0, "step": 20546 }, { "epoch": 2.6137895941992113, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.853214263916016, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8806091547012329, "num_tokens": 784059529.0, "step": 20547 }, { "epoch": 2.613916804477802, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.10914993286133, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.865684986114502, "num_tokens": 784103530.0, "step": 20548 }, { "epoch": 2.6140440147563924, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.09666061401367, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8561426997184753, "num_tokens": 784141721.0, "step": 20549 }, { "epoch": 2.614171225034983, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.81364059448242, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8648974895477295, "num_tokens": 784175868.0, "step": 20550 }, { "epoch": 2.6142984353135734, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.391258239746094, "learning_rate": 1e-06, "loss": 0.5742, "mean_token_accuracy": 0.8747419714927673, "num_tokens": 784210646.0, "step": 20551 }, { "epoch": 2.614425645592164, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.32194900512695, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8684647083282471, "num_tokens": 784255588.0, "step": 20552 }, { "epoch": 2.6145528558707545, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.19085693359375, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8651750683784485, "num_tokens": 784292877.0, "step": 20553 }, { "epoch": 2.614680066149345, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.781394958496094, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8671606779098511, "num_tokens": 784336150.0, "step": 20554 }, { "epoch": 2.6148072764279355, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.16169738769531, "learning_rate": 1e-06, "loss": 0.5584, "mean_token_accuracy": 0.8771610260009766, "num_tokens": 784371547.0, "step": 20555 }, { "epoch": 2.6149344867065256, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.05113983154297, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8665498495101929, "num_tokens": 784411473.0, "step": 20556 }, { "epoch": 2.6150616969851166, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.724666595458984, "learning_rate": 1e-06, "loss": 0.6358, "mean_token_accuracy": 0.8547724485397339, "num_tokens": 784456713.0, "step": 20557 }, { "epoch": 2.6151889072637067, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.21337890625, "learning_rate": 1e-06, "loss": 0.5628, "mean_token_accuracy": 0.8779506683349609, "num_tokens": 784493685.0, "step": 20558 }, { "epoch": 2.6153161175422976, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.96194076538086, "learning_rate": 1e-06, "loss": 0.5736, "mean_token_accuracy": 0.876050591468811, "num_tokens": 784534641.0, "step": 20559 }, { "epoch": 2.6154433278208877, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.442806243896484, "learning_rate": 1e-06, "loss": 0.6839, "mean_token_accuracy": 0.8408588767051697, "num_tokens": 784576184.0, "step": 20560 }, { "epoch": 2.6155705380994787, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.7213020324707, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8663938045501709, "num_tokens": 784613851.0, "step": 20561 }, { "epoch": 2.615697748378069, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.03269577026367, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8658902645111084, "num_tokens": 784654133.0, "step": 20562 }, { "epoch": 2.6158249586566593, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.95695877075195, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8656435012817383, "num_tokens": 784690529.0, "step": 20563 }, { "epoch": 2.61595216893525, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.95646667480469, "learning_rate": 1e-06, "loss": 0.6058, "mean_token_accuracy": 0.8637459874153137, "num_tokens": 784731972.0, "step": 20564 }, { "epoch": 2.6160793792138404, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.98050308227539, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8589679002761841, "num_tokens": 784773275.0, "step": 20565 }, { "epoch": 2.616206589492431, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.27688980102539, "learning_rate": 1e-06, "loss": 0.6611, "mean_token_accuracy": 0.8480074405670166, "num_tokens": 784811468.0, "step": 20566 }, { "epoch": 2.6163337997710214, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.967952728271484, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8651896715164185, "num_tokens": 784850611.0, "step": 20567 }, { "epoch": 2.616461010049612, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.985260009765625, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8687872886657715, "num_tokens": 784886502.0, "step": 20568 }, { "epoch": 2.6165882203282025, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.00737762451172, "learning_rate": 1e-06, "loss": 0.644, "mean_token_accuracy": 0.8542493581771851, "num_tokens": 784924489.0, "step": 20569 }, { "epoch": 2.616715430606793, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.20480728149414, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8670961856842041, "num_tokens": 784957224.0, "step": 20570 }, { "epoch": 2.6168426408853835, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.27297592163086, "learning_rate": 1e-06, "loss": 0.6938, "mean_token_accuracy": 0.838279128074646, "num_tokens": 784997754.0, "step": 20571 }, { "epoch": 2.616969851163974, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.61988830566406, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.8488954305648804, "num_tokens": 785033932.0, "step": 20572 }, { "epoch": 2.6170970614425646, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.54887008666992, "learning_rate": 1e-06, "loss": 0.612, "mean_token_accuracy": 0.8622657656669617, "num_tokens": 785073190.0, "step": 20573 }, { "epoch": 2.617224271721155, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.622562408447266, "learning_rate": 1e-06, "loss": 0.5504, "mean_token_accuracy": 0.8814886808395386, "num_tokens": 785108397.0, "step": 20574 }, { "epoch": 2.6173514819997457, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.33378982543945, "learning_rate": 1e-06, "loss": 0.5326, "mean_token_accuracy": 0.8912121057510376, "num_tokens": 785142671.0, "step": 20575 }, { "epoch": 2.617478692278336, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.990203857421875, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.8619387149810791, "num_tokens": 785183467.0, "step": 20576 }, { "epoch": 2.6176059025569267, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.56523513793945, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8539073467254639, "num_tokens": 785222072.0, "step": 20577 }, { "epoch": 2.6177331128355172, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.63774108886719, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.8526183366775513, "num_tokens": 785260458.0, "step": 20578 }, { "epoch": 2.6178603231141078, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.160709381103516, "learning_rate": 1e-06, "loss": 0.6563, "mean_token_accuracy": 0.8507575392723083, "num_tokens": 785302434.0, "step": 20579 }, { "epoch": 2.6179875333926983, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.024009704589844, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.8468660712242126, "num_tokens": 785338251.0, "step": 20580 }, { "epoch": 2.6181147436712884, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.17546844482422, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8590341806411743, "num_tokens": 785384789.0, "step": 20581 }, { "epoch": 2.6182419539498794, "ewc_loss": 0.2001953125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017642974853515625, "grad_norm": 50.200618743896484, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8628662824630737, "num_tokens": 785424978.0, "step": 20582 }, { "epoch": 2.6183691642284694, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.68631362915039, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8559104204177856, "num_tokens": 785464421.0, "step": 20583 }, { "epoch": 2.6184963745070604, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.980587005615234, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8819689750671387, "num_tokens": 785496166.0, "step": 20584 }, { "epoch": 2.6186235847856505, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.87553405761719, "learning_rate": 1e-06, "loss": 0.6568, "mean_token_accuracy": 0.8488172292709351, "num_tokens": 785528504.0, "step": 20585 }, { "epoch": 2.6187507950642415, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.642024993896484, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8658524751663208, "num_tokens": 785570896.0, "step": 20586 }, { "epoch": 2.6188780053428315, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.6911735534668, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8597538471221924, "num_tokens": 785612464.0, "step": 20587 }, { "epoch": 2.619005215621422, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.655059814453125, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8632078170776367, "num_tokens": 785655500.0, "step": 20588 }, { "epoch": 2.6191324259000126, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.260189056396484, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8655832409858704, "num_tokens": 785691186.0, "step": 20589 }, { "epoch": 2.619259636178603, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.61458206176758, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.877250611782074, "num_tokens": 785728575.0, "step": 20590 }, { "epoch": 2.6193868464571937, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.20402145385742, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8611167669296265, "num_tokens": 785765782.0, "step": 20591 }, { "epoch": 2.619514056735784, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.59514617919922, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.867943286895752, "num_tokens": 785800130.0, "step": 20592 }, { "epoch": 2.6196412670143747, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.38774490356445, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8912686705589294, "num_tokens": 785836647.0, "step": 20593 }, { "epoch": 2.6197684772929652, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.49481964111328, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8563047647476196, "num_tokens": 785873897.0, "step": 20594 }, { "epoch": 2.6198956875715558, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.66555404663086, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8588138818740845, "num_tokens": 785914296.0, "step": 20595 }, { "epoch": 2.6200228978501463, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.17124557495117, "learning_rate": 1e-06, "loss": 0.6518, "mean_token_accuracy": 0.8501195907592773, "num_tokens": 785951426.0, "step": 20596 }, { "epoch": 2.620150108128737, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.82231521606445, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8666540384292603, "num_tokens": 785989906.0, "step": 20597 }, { "epoch": 2.6202773184073274, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.26650619506836, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8582736253738403, "num_tokens": 786024763.0, "step": 20598 }, { "epoch": 2.620404528685918, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.041954040527344, "learning_rate": 1e-06, "loss": 0.6308, "mean_token_accuracy": 0.8590909242630005, "num_tokens": 786063055.0, "step": 20599 }, { "epoch": 2.6205317389645084, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.96727752685547, "learning_rate": 1e-06, "loss": 0.7118, "mean_token_accuracy": 0.8361101150512695, "num_tokens": 786097675.0, "step": 20600 }, { "epoch": 2.620658949243099, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.08692932128906, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8746646642684937, "num_tokens": 786133721.0, "step": 20601 }, { "epoch": 2.6207861595216895, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.124019622802734, "learning_rate": 1e-06, "loss": 0.6723, "mean_token_accuracy": 0.8452489972114563, "num_tokens": 786179615.0, "step": 20602 }, { "epoch": 2.62091336980028, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.47299575805664, "learning_rate": 1e-06, "loss": 0.5631, "mean_token_accuracy": 0.8764188289642334, "num_tokens": 786214945.0, "step": 20603 }, { "epoch": 2.6210405800788705, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.0110969543457, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.8647929430007935, "num_tokens": 786250880.0, "step": 20604 }, { "epoch": 2.621167790357461, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.252525329589844, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8641629219055176, "num_tokens": 786286095.0, "step": 20605 }, { "epoch": 2.621295000636051, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.39127731323242, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8665837645530701, "num_tokens": 786319547.0, "step": 20606 }, { "epoch": 2.621422210914642, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.485740661621094, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8621315956115723, "num_tokens": 786358699.0, "step": 20607 }, { "epoch": 2.621549421193232, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 50.93400192260742, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8666448593139648, "num_tokens": 786399732.0, "step": 20608 }, { "epoch": 2.621676631471823, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.676517486572266, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.8532965183258057, "num_tokens": 786437375.0, "step": 20609 }, { "epoch": 2.6218038417504133, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.84995651245117, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.851253867149353, "num_tokens": 786475538.0, "step": 20610 }, { "epoch": 2.621931052029004, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.7289924621582, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8648099899291992, "num_tokens": 786516802.0, "step": 20611 }, { "epoch": 2.6220582623075943, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.66264343261719, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.864346981048584, "num_tokens": 786558470.0, "step": 20612 }, { "epoch": 2.622185472586185, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.07106399536133, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8581027984619141, "num_tokens": 786601163.0, "step": 20613 }, { "epoch": 2.6223126828647754, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.528133392333984, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8707119226455688, "num_tokens": 786636741.0, "step": 20614 }, { "epoch": 2.622439893143366, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.17023468017578, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8504951000213623, "num_tokens": 786672931.0, "step": 20615 }, { "epoch": 2.6225671034219564, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.79499816894531, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8753995895385742, "num_tokens": 786709310.0, "step": 20616 }, { "epoch": 2.622694313700547, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.996315002441406, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8701321482658386, "num_tokens": 786747113.0, "step": 20617 }, { "epoch": 2.6228215239791375, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.02683639526367, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.862842321395874, "num_tokens": 786782033.0, "step": 20618 }, { "epoch": 2.622948734257728, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.93219757080078, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8645722270011902, "num_tokens": 786827229.0, "step": 20619 }, { "epoch": 2.6230759445363185, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.02070617675781, "learning_rate": 1e-06, "loss": 0.5792, "mean_token_accuracy": 0.8722438216209412, "num_tokens": 786861722.0, "step": 20620 }, { "epoch": 2.623203154814909, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.8563117980957, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.866234302520752, "num_tokens": 786896202.0, "step": 20621 }, { "epoch": 2.6233303650934996, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.46194076538086, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8659608364105225, "num_tokens": 786932154.0, "step": 20622 }, { "epoch": 2.62345757537209, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.58598327636719, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8544673919677734, "num_tokens": 786972184.0, "step": 20623 }, { "epoch": 2.6235847856506807, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.91629409790039, "learning_rate": 1e-06, "loss": 0.638, "mean_token_accuracy": 0.8567701578140259, "num_tokens": 787015514.0, "step": 20624 }, { "epoch": 2.623711995929271, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.698944091796875, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8614407181739807, "num_tokens": 787054023.0, "step": 20625 }, { "epoch": 2.6238392062078617, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.860252380371094, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8594645857810974, "num_tokens": 787088858.0, "step": 20626 }, { "epoch": 2.6239664164864522, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.54053497314453, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8661811351776123, "num_tokens": 787124805.0, "step": 20627 }, { "epoch": 2.6240936267650428, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.71939468383789, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8762555122375488, "num_tokens": 787164922.0, "step": 20628 }, { "epoch": 2.6242208370436333, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.618141174316406, "learning_rate": 1e-06, "loss": 0.5649, "mean_token_accuracy": 0.8751335144042969, "num_tokens": 787204936.0, "step": 20629 }, { "epoch": 2.624348047322224, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.5163459777832, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8761826753616333, "num_tokens": 787238292.0, "step": 20630 }, { "epoch": 2.624475257600814, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.83357238769531, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8821322321891785, "num_tokens": 787269054.0, "step": 20631 }, { "epoch": 2.624602467879405, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.53961944580078, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8608992099761963, "num_tokens": 787310022.0, "step": 20632 }, { "epoch": 2.624729678157995, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.22332000732422, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8547495603561401, "num_tokens": 787350917.0, "step": 20633 }, { "epoch": 2.624856888436586, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.01874923706055, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8718335628509521, "num_tokens": 787388187.0, "step": 20634 }, { "epoch": 2.624984098715176, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.9952278137207, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8682667016983032, "num_tokens": 787424998.0, "step": 20635 }, { "epoch": 2.6251113089937665, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.95857620239258, "learning_rate": 1e-06, "loss": 0.571, "mean_token_accuracy": 0.8751461505889893, "num_tokens": 787465456.0, "step": 20636 }, { "epoch": 2.625238519272357, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.295021057128906, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8620449304580688, "num_tokens": 787504797.0, "step": 20637 }, { "epoch": 2.6253657295509476, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.84305191040039, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8745576739311218, "num_tokens": 787542911.0, "step": 20638 }, { "epoch": 2.625492939829538, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.26506423950195, "learning_rate": 1e-06, "loss": 0.6319, "mean_token_accuracy": 0.8551511764526367, "num_tokens": 787576143.0, "step": 20639 }, { "epoch": 2.6256201501081287, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.56337356567383, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8656445741653442, "num_tokens": 787619912.0, "step": 20640 }, { "epoch": 2.625747360386719, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.531490325927734, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8531058430671692, "num_tokens": 787658181.0, "step": 20641 }, { "epoch": 2.6258745706653097, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.15201187133789, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.8544963598251343, "num_tokens": 787694889.0, "step": 20642 }, { "epoch": 2.6260017809439002, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.7324104309082, "learning_rate": 1e-06, "loss": 0.5643, "mean_token_accuracy": 0.8773345947265625, "num_tokens": 787737633.0, "step": 20643 }, { "epoch": 2.6261289912224908, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.44672393798828, "learning_rate": 1e-06, "loss": 0.6591, "mean_token_accuracy": 0.8507155179977417, "num_tokens": 787779069.0, "step": 20644 }, { "epoch": 2.6262562015010813, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.051422119140625, "learning_rate": 1e-06, "loss": 0.6633, "mean_token_accuracy": 0.8515305519104004, "num_tokens": 787815544.0, "step": 20645 }, { "epoch": 2.626383411779672, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.28453063964844, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.85820472240448, "num_tokens": 787852684.0, "step": 20646 }, { "epoch": 2.6265106220582624, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.04378890991211, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8609431982040405, "num_tokens": 787893452.0, "step": 20647 }, { "epoch": 2.626637832336853, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.88832473754883, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8633255958557129, "num_tokens": 787929430.0, "step": 20648 }, { "epoch": 2.6267650426154434, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.9073371887207, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8607991933822632, "num_tokens": 787969768.0, "step": 20649 }, { "epoch": 2.626892252894034, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.6246452331543, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8636267185211182, "num_tokens": 788009605.0, "step": 20650 }, { "epoch": 2.6270194631726245, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.57670593261719, "learning_rate": 1e-06, "loss": 0.5676, "mean_token_accuracy": 0.8765366077423096, "num_tokens": 788044652.0, "step": 20651 }, { "epoch": 2.627146673451215, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.488033294677734, "learning_rate": 1e-06, "loss": 0.7019, "mean_token_accuracy": 0.8360086679458618, "num_tokens": 788085981.0, "step": 20652 }, { "epoch": 2.6272738837298055, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.7149658203125, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.86070716381073, "num_tokens": 788132196.0, "step": 20653 }, { "epoch": 2.6274010940083956, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.77460861206055, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.874968409538269, "num_tokens": 788160099.0, "step": 20654 }, { "epoch": 2.6275283042869866, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.755245208740234, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8597427606582642, "num_tokens": 788197264.0, "step": 20655 }, { "epoch": 2.6276555145655767, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.85403060913086, "learning_rate": 1e-06, "loss": 0.576, "mean_token_accuracy": 0.8745432496070862, "num_tokens": 788232900.0, "step": 20656 }, { "epoch": 2.6277827248441676, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.55976104736328, "learning_rate": 1e-06, "loss": 0.631, "mean_token_accuracy": 0.8581777811050415, "num_tokens": 788273720.0, "step": 20657 }, { "epoch": 2.6279099351227577, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.71523666381836, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8651008605957031, "num_tokens": 788308425.0, "step": 20658 }, { "epoch": 2.6280371454013487, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.66449737548828, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8551779985427856, "num_tokens": 788343701.0, "step": 20659 }, { "epoch": 2.628164355679939, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.73694610595703, "learning_rate": 1e-06, "loss": 0.6791, "mean_token_accuracy": 0.8478896617889404, "num_tokens": 788372789.0, "step": 20660 }, { "epoch": 2.6282915659585293, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.53694152832031, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.868428647518158, "num_tokens": 788408422.0, "step": 20661 }, { "epoch": 2.62841877623712, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.95026397705078, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8664335012435913, "num_tokens": 788446113.0, "step": 20662 }, { "epoch": 2.6285459865157104, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.00336456298828, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.857658863067627, "num_tokens": 788485009.0, "step": 20663 }, { "epoch": 2.628673196794301, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.61598205566406, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8668617010116577, "num_tokens": 788523446.0, "step": 20664 }, { "epoch": 2.6288004070728914, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.68263626098633, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8731210231781006, "num_tokens": 788557423.0, "step": 20665 }, { "epoch": 2.628927617351482, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.57815933227539, "learning_rate": 1e-06, "loss": 0.6308, "mean_token_accuracy": 0.8574542999267578, "num_tokens": 788588177.0, "step": 20666 }, { "epoch": 2.6290548276300725, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.607276916503906, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8646108508110046, "num_tokens": 788624877.0, "step": 20667 }, { "epoch": 2.629182037908663, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.659873962402344, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8726233243942261, "num_tokens": 788662532.0, "step": 20668 }, { "epoch": 2.6293092481872535, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.54179763793945, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8638260960578918, "num_tokens": 788702734.0, "step": 20669 }, { "epoch": 2.629436458465844, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.20390701293945, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.8622243404388428, "num_tokens": 788738041.0, "step": 20670 }, { "epoch": 2.6295636687444346, "ewc_loss": 0.203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.229827880859375, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8628935217857361, "num_tokens": 788772998.0, "step": 20671 }, { "epoch": 2.629690879023025, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.55320358276367, "learning_rate": 1e-06, "loss": 0.6632, "mean_token_accuracy": 0.8474114537239075, "num_tokens": 788814310.0, "step": 20672 }, { "epoch": 2.6298180893016156, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 50.82438659667969, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8673936724662781, "num_tokens": 788852811.0, "step": 20673 }, { "epoch": 2.629945299580206, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.551937103271484, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8707064390182495, "num_tokens": 788888756.0, "step": 20674 }, { "epoch": 2.6300725098587967, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.374412536621094, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8670451641082764, "num_tokens": 788926292.0, "step": 20675 }, { "epoch": 2.6301997201373872, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.698368072509766, "learning_rate": 1e-06, "loss": 0.572, "mean_token_accuracy": 0.8725047707557678, "num_tokens": 788965625.0, "step": 20676 }, { "epoch": 2.6303269304159778, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.71766662597656, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8683986067771912, "num_tokens": 789007390.0, "step": 20677 }, { "epoch": 2.6304541406945683, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.48102951049805, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8667087554931641, "num_tokens": 789047940.0, "step": 20678 }, { "epoch": 2.6305813509731584, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.46051025390625, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8640884757041931, "num_tokens": 789086321.0, "step": 20679 }, { "epoch": 2.6307085612517493, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.05699920654297, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8778390288352966, "num_tokens": 789121757.0, "step": 20680 }, { "epoch": 2.6308357715303394, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.58604431152344, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8771398067474365, "num_tokens": 789160076.0, "step": 20681 }, { "epoch": 2.6309629818089304, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.402687072753906, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8548300266265869, "num_tokens": 789197080.0, "step": 20682 }, { "epoch": 2.6310901920875205, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.93375015258789, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.8572719097137451, "num_tokens": 789249876.0, "step": 20683 }, { "epoch": 2.6312174023661115, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.18608474731445, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8621537685394287, "num_tokens": 789290146.0, "step": 20684 }, { "epoch": 2.6313446126447015, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.186073303222656, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8675271272659302, "num_tokens": 789335610.0, "step": 20685 }, { "epoch": 2.631471822923292, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.127254486083984, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8579397797584534, "num_tokens": 789371956.0, "step": 20686 }, { "epoch": 2.6315990332018826, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.9241828918457, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8609554171562195, "num_tokens": 789415257.0, "step": 20687 }, { "epoch": 2.631726243480473, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.0565299987793, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8649545907974243, "num_tokens": 789452533.0, "step": 20688 }, { "epoch": 2.6318534537590637, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.86140441894531, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8593902587890625, "num_tokens": 789488780.0, "step": 20689 }, { "epoch": 2.631980664037654, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.2724494934082, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8609011769294739, "num_tokens": 789528169.0, "step": 20690 }, { "epoch": 2.6321078743162447, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.92817306518555, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8562042713165283, "num_tokens": 789568382.0, "step": 20691 }, { "epoch": 2.6322350845948352, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.28594207763672, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8600423336029053, "num_tokens": 789606895.0, "step": 20692 }, { "epoch": 2.6323622948734258, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.43508529663086, "learning_rate": 1e-06, "loss": 0.6592, "mean_token_accuracy": 0.8478003144264221, "num_tokens": 789649106.0, "step": 20693 }, { "epoch": 2.6324895051520163, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.972450256347656, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8677844405174255, "num_tokens": 789689449.0, "step": 20694 }, { "epoch": 2.632616715430607, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.66191864013672, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8746230602264404, "num_tokens": 789726179.0, "step": 20695 }, { "epoch": 2.6327439257091974, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.878814697265625, "learning_rate": 1e-06, "loss": 0.6592, "mean_token_accuracy": 0.8451225757598877, "num_tokens": 789764858.0, "step": 20696 }, { "epoch": 2.632871135987788, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.36906814575195, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8503729104995728, "num_tokens": 789799196.0, "step": 20697 }, { "epoch": 2.6329983462663784, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.08316421508789, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8620542287826538, "num_tokens": 789840498.0, "step": 20698 }, { "epoch": 2.633125556544969, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.263004302978516, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8673994541168213, "num_tokens": 789874269.0, "step": 20699 }, { "epoch": 2.6332527668235595, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.3397216796875, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8650588393211365, "num_tokens": 789914938.0, "step": 20700 }, { "epoch": 2.63337997710215, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.295257568359375, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8636342287063599, "num_tokens": 789959219.0, "step": 20701 }, { "epoch": 2.6335071873807405, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.249717712402344, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8578324913978577, "num_tokens": 789993707.0, "step": 20702 }, { "epoch": 2.633634397659331, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.05339431762695, "learning_rate": 1e-06, "loss": 0.6742, "mean_token_accuracy": 0.8427892923355103, "num_tokens": 790034285.0, "step": 20703 }, { "epoch": 2.633761607937921, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.61135482788086, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8662900924682617, "num_tokens": 790069714.0, "step": 20704 }, { "epoch": 2.633888818216512, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.16851806640625, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8619991540908813, "num_tokens": 790110733.0, "step": 20705 }, { "epoch": 2.634016028495102, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.27886199951172, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8673843145370483, "num_tokens": 790144468.0, "step": 20706 }, { "epoch": 2.634143238773693, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.832481384277344, "learning_rate": 1e-06, "loss": 0.6153, "mean_token_accuracy": 0.8608360290527344, "num_tokens": 790186881.0, "step": 20707 }, { "epoch": 2.6342704490522832, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.3115348815918, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8687229156494141, "num_tokens": 790225083.0, "step": 20708 }, { "epoch": 2.6343976593308738, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.19310760498047, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8636267781257629, "num_tokens": 790266822.0, "step": 20709 }, { "epoch": 2.6345248696094643, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.92098617553711, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8614530563354492, "num_tokens": 790305948.0, "step": 20710 }, { "epoch": 2.634652079888055, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.61500549316406, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8647820949554443, "num_tokens": 790341639.0, "step": 20711 }, { "epoch": 2.6347792901666454, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.37242889404297, "learning_rate": 1e-06, "loss": 0.5751, "mean_token_accuracy": 0.8715686798095703, "num_tokens": 790377442.0, "step": 20712 }, { "epoch": 2.634906500445236, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.34001159667969, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8615720868110657, "num_tokens": 790414911.0, "step": 20713 }, { "epoch": 2.6350337107238264, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.7341423034668, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8627668619155884, "num_tokens": 790453968.0, "step": 20714 }, { "epoch": 2.635160921002417, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.12660217285156, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.858897864818573, "num_tokens": 790493216.0, "step": 20715 }, { "epoch": 2.6352881312810075, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.48371124267578, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8584259748458862, "num_tokens": 790529380.0, "step": 20716 }, { "epoch": 2.635415341559598, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.74631118774414, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.8590741157531738, "num_tokens": 790569568.0, "step": 20717 }, { "epoch": 2.6355425518381885, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.047115325927734, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.8686728477478027, "num_tokens": 790602269.0, "step": 20718 }, { "epoch": 2.635669762116779, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.62147521972656, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.851111650466919, "num_tokens": 790639723.0, "step": 20719 }, { "epoch": 2.6357969723953696, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.13068389892578, "learning_rate": 1e-06, "loss": 0.6604, "mean_token_accuracy": 0.848819375038147, "num_tokens": 790676103.0, "step": 20720 }, { "epoch": 2.63592418267396, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.71047592163086, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8652512431144714, "num_tokens": 790712118.0, "step": 20721 }, { "epoch": 2.6360513929525506, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.01256561279297, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8617398142814636, "num_tokens": 790747079.0, "step": 20722 }, { "epoch": 2.636178603231141, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.147544860839844, "learning_rate": 1e-06, "loss": 0.6361, "mean_token_accuracy": 0.8530640006065369, "num_tokens": 790781741.0, "step": 20723 }, { "epoch": 2.6363058135097317, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.99748611450195, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8618708252906799, "num_tokens": 790820243.0, "step": 20724 }, { "epoch": 2.6364330237883222, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.41788864135742, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8771623373031616, "num_tokens": 790851492.0, "step": 20725 }, { "epoch": 2.6365602340669128, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.13935089111328, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8687853813171387, "num_tokens": 790882293.0, "step": 20726 }, { "epoch": 2.636687444345503, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.58869552612305, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8587228059768677, "num_tokens": 790922188.0, "step": 20727 }, { "epoch": 2.636814654624094, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.026973724365234, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.8723114728927612, "num_tokens": 790960675.0, "step": 20728 }, { "epoch": 2.636941864902684, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.48450469970703, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8714885711669922, "num_tokens": 790999352.0, "step": 20729 }, { "epoch": 2.637069075181275, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.13661575317383, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8643655180931091, "num_tokens": 791035898.0, "step": 20730 }, { "epoch": 2.637196285459865, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.17782211303711, "learning_rate": 1e-06, "loss": 0.6894, "mean_token_accuracy": 0.8384872674942017, "num_tokens": 791079731.0, "step": 20731 }, { "epoch": 2.637323495738456, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.3203125, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8745691776275635, "num_tokens": 791118565.0, "step": 20732 }, { "epoch": 2.637450706017046, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.86011505126953, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.863695502281189, "num_tokens": 791154876.0, "step": 20733 }, { "epoch": 2.6375779162956365, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.61837387084961, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8618321418762207, "num_tokens": 791196003.0, "step": 20734 }, { "epoch": 2.637705126574227, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.31633758544922, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8617294430732727, "num_tokens": 791236462.0, "step": 20735 }, { "epoch": 2.6378323368528176, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.81759262084961, "learning_rate": 1e-06, "loss": 0.656, "mean_token_accuracy": 0.8498464226722717, "num_tokens": 791271918.0, "step": 20736 }, { "epoch": 2.637959547131408, "ewc_loss": 0.201171875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001773834228515625, "grad_norm": 50.92080307006836, "learning_rate": 1e-06, "loss": 0.5606, "mean_token_accuracy": 0.8748651146888733, "num_tokens": 791309813.0, "step": 20737 }, { "epoch": 2.6380867574099987, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.180850982666016, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8705801367759705, "num_tokens": 791338854.0, "step": 20738 }, { "epoch": 2.638213967688589, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.726806640625, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8691377639770508, "num_tokens": 791376580.0, "step": 20739 }, { "epoch": 2.6383411779671797, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.01897430419922, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8598403334617615, "num_tokens": 791415931.0, "step": 20740 }, { "epoch": 2.6384683882457702, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.11555862426758, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.8592253923416138, "num_tokens": 791448201.0, "step": 20741 }, { "epoch": 2.6385955985243608, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.689884185791016, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.854884684085846, "num_tokens": 791489183.0, "step": 20742 }, { "epoch": 2.6387228088029513, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.16218185424805, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8638527989387512, "num_tokens": 791528414.0, "step": 20743 }, { "epoch": 2.638850019081542, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.13314437866211, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8819116353988647, "num_tokens": 791566379.0, "step": 20744 }, { "epoch": 2.6389772293601323, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.63611602783203, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8646423816680908, "num_tokens": 791599789.0, "step": 20745 }, { "epoch": 2.639104439638723, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.749874114990234, "learning_rate": 1e-06, "loss": 0.6561, "mean_token_accuracy": 0.8484220504760742, "num_tokens": 791642333.0, "step": 20746 }, { "epoch": 2.6392316499173134, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.87372589111328, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8651269674301147, "num_tokens": 791687190.0, "step": 20747 }, { "epoch": 2.639358860195904, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.33452606201172, "learning_rate": 1e-06, "loss": 0.6459, "mean_token_accuracy": 0.8483631610870361, "num_tokens": 791722806.0, "step": 20748 }, { "epoch": 2.6394860704744945, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 52.10408020019531, "learning_rate": 1e-06, "loss": 0.6581, "mean_token_accuracy": 0.8483319282531738, "num_tokens": 791757726.0, "step": 20749 }, { "epoch": 2.639613280753085, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.885066986083984, "learning_rate": 1e-06, "loss": 0.5897, "mean_token_accuracy": 0.8700567483901978, "num_tokens": 791795160.0, "step": 20750 }, { "epoch": 2.6397404910316755, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.32111358642578, "learning_rate": 1e-06, "loss": 0.5834, "mean_token_accuracy": 0.8751864433288574, "num_tokens": 791831682.0, "step": 20751 }, { "epoch": 2.6398677013102656, "ewc_loss": 0.2021484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00017833709716796875, "grad_norm": 51.25148391723633, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8733094334602356, "num_tokens": 791863230.0, "step": 20752 }, { "epoch": 2.6399949115888566, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 52.00938034057617, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8769657611846924, "num_tokens": 791904323.0, "step": 20753 }, { "epoch": 2.6401221218674467, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.25320053100586, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8550492525100708, "num_tokens": 791951836.0, "step": 20754 }, { "epoch": 2.6402493321460376, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.77853775024414, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8655031323432922, "num_tokens": 791989782.0, "step": 20755 }, { "epoch": 2.6403765424246277, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.87854766845703, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8649650812149048, "num_tokens": 792024038.0, "step": 20756 }, { "epoch": 2.6405037527032187, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.747371673583984, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8629103302955627, "num_tokens": 792067893.0, "step": 20757 }, { "epoch": 2.6406309629818088, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.543453216552734, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8730998039245605, "num_tokens": 792108593.0, "step": 20758 }, { "epoch": 2.6407581732603993, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.266990661621094, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8762214183807373, "num_tokens": 792141201.0, "step": 20759 }, { "epoch": 2.64088538353899, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 52.111610412597656, "learning_rate": 1e-06, "loss": 0.5499, "mean_token_accuracy": 0.8825697898864746, "num_tokens": 792182908.0, "step": 20760 }, { "epoch": 2.6410125938175804, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.684104919433594, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8646007180213928, "num_tokens": 792225223.0, "step": 20761 }, { "epoch": 2.641139804096171, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.94644546508789, "learning_rate": 1e-06, "loss": 0.6658, "mean_token_accuracy": 0.8464778661727905, "num_tokens": 792258131.0, "step": 20762 }, { "epoch": 2.6412670143747614, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.73451232910156, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8690042495727539, "num_tokens": 792291429.0, "step": 20763 }, { "epoch": 2.641394224653352, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.7301025390625, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8576923608779907, "num_tokens": 792327777.0, "step": 20764 }, { "epoch": 2.6415214349319425, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.153133392333984, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8667941093444824, "num_tokens": 792367918.0, "step": 20765 }, { "epoch": 2.641648645210533, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.22077941894531, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.868462085723877, "num_tokens": 792405550.0, "step": 20766 }, { "epoch": 2.6417758554891235, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.970672607421875, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8655626773834229, "num_tokens": 792440948.0, "step": 20767 }, { "epoch": 2.641903065767714, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.29740905761719, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8703407049179077, "num_tokens": 792488065.0, "step": 20768 }, { "epoch": 2.6420302760463046, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.43028259277344, "learning_rate": 1e-06, "loss": 0.575, "mean_token_accuracy": 0.8711099028587341, "num_tokens": 792522845.0, "step": 20769 }, { "epoch": 2.642157486324895, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.62176513671875, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8745681047439575, "num_tokens": 792561118.0, "step": 20770 }, { "epoch": 2.6422846966034856, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.19793701171875, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8629758358001709, "num_tokens": 792600125.0, "step": 20771 }, { "epoch": 2.642411906882076, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.503746032714844, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.867527961730957, "num_tokens": 792633856.0, "step": 20772 }, { "epoch": 2.6425391171606667, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.64682388305664, "learning_rate": 1e-06, "loss": 0.6307, "mean_token_accuracy": 0.8586845397949219, "num_tokens": 792672427.0, "step": 20773 }, { "epoch": 2.6426663274392572, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.28969955444336, "learning_rate": 1e-06, "loss": 0.6643, "mean_token_accuracy": 0.841052770614624, "num_tokens": 792713856.0, "step": 20774 }, { "epoch": 2.6427935377178478, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.266883850097656, "learning_rate": 1e-06, "loss": 0.6482, "mean_token_accuracy": 0.8555319905281067, "num_tokens": 792757602.0, "step": 20775 }, { "epoch": 2.6429207479964383, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.37723922729492, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.852384626865387, "num_tokens": 792797067.0, "step": 20776 }, { "epoch": 2.6430479582750284, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.498077392578125, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8672001361846924, "num_tokens": 792835748.0, "step": 20777 }, { "epoch": 2.6431751685536193, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.89484786987305, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8657812476158142, "num_tokens": 792875039.0, "step": 20778 }, { "epoch": 2.6433023788322094, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.349266052246094, "learning_rate": 1e-06, "loss": 0.586, "mean_token_accuracy": 0.869835615158081, "num_tokens": 792916580.0, "step": 20779 }, { "epoch": 2.6434295891108004, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.236820220947266, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8658117055892944, "num_tokens": 792952507.0, "step": 20780 }, { "epoch": 2.6435567993893905, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.35557556152344, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8656190037727356, "num_tokens": 792994046.0, "step": 20781 }, { "epoch": 2.6436840096679814, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.7635383605957, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.8834198713302612, "num_tokens": 793033855.0, "step": 20782 }, { "epoch": 2.6438112199465715, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.80122375488281, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8674166202545166, "num_tokens": 793072226.0, "step": 20783 }, { "epoch": 2.643938430225162, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.765953063964844, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8667854070663452, "num_tokens": 793110884.0, "step": 20784 }, { "epoch": 2.6440656405037526, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.05503463745117, "learning_rate": 1e-06, "loss": 0.6773, "mean_token_accuracy": 0.8469365835189819, "num_tokens": 793151885.0, "step": 20785 }, { "epoch": 2.644192850782343, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.36448669433594, "learning_rate": 1e-06, "loss": 0.675, "mean_token_accuracy": 0.8422980308532715, "num_tokens": 793187253.0, "step": 20786 }, { "epoch": 2.6443200610609336, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.19879150390625, "learning_rate": 1e-06, "loss": 0.5942, "mean_token_accuracy": 0.8688377141952515, "num_tokens": 793231249.0, "step": 20787 }, { "epoch": 2.644447271339524, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.1677360534668, "learning_rate": 1e-06, "loss": 0.5838, "mean_token_accuracy": 0.874708890914917, "num_tokens": 793269918.0, "step": 20788 }, { "epoch": 2.6445744816181147, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.61262512207031, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8731436729431152, "num_tokens": 793306856.0, "step": 20789 }, { "epoch": 2.6447016918967052, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 50.895442962646484, "learning_rate": 1e-06, "loss": 0.6509, "mean_token_accuracy": 0.853050708770752, "num_tokens": 793347866.0, "step": 20790 }, { "epoch": 2.6448289021752958, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.679893493652344, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8616011142730713, "num_tokens": 793391037.0, "step": 20791 }, { "epoch": 2.6449561124538863, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.699058532714844, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.8494585752487183, "num_tokens": 793423560.0, "step": 20792 }, { "epoch": 2.645083322732477, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.95746612548828, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8695659637451172, "num_tokens": 793460215.0, "step": 20793 }, { "epoch": 2.6452105330110673, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.628517150878906, "learning_rate": 1e-06, "loss": 0.6377, "mean_token_accuracy": 0.8501135110855103, "num_tokens": 793492671.0, "step": 20794 }, { "epoch": 2.645337743289658, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.96991729736328, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8628883361816406, "num_tokens": 793529526.0, "step": 20795 }, { "epoch": 2.6454649535682484, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.801937103271484, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8724241852760315, "num_tokens": 793566748.0, "step": 20796 }, { "epoch": 2.645592163846839, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.496768951416016, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8767620921134949, "num_tokens": 793600703.0, "step": 20797 }, { "epoch": 2.6457193741254295, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.68547439575195, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.8628193140029907, "num_tokens": 793639226.0, "step": 20798 }, { "epoch": 2.64584658440402, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.11931610107422, "learning_rate": 1e-06, "loss": 0.5645, "mean_token_accuracy": 0.8764355778694153, "num_tokens": 793676147.0, "step": 20799 }, { "epoch": 2.6459737946826105, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.7630500793457, "learning_rate": 1e-06, "loss": 0.5835, "mean_token_accuracy": 0.8754364252090454, "num_tokens": 793715778.0, "step": 20800 }, { "epoch": 2.646101004961201, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.787353515625, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8800923824310303, "num_tokens": 793752096.0, "step": 20801 }, { "epoch": 2.646228215239791, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.96847915649414, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8580927848815918, "num_tokens": 793789809.0, "step": 20802 }, { "epoch": 2.646355425518382, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.863807678222656, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8606693744659424, "num_tokens": 793826505.0, "step": 20803 }, { "epoch": 2.646482635796972, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.126319885253906, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8624367117881775, "num_tokens": 793868360.0, "step": 20804 }, { "epoch": 2.646609846075563, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.21973419189453, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8634449243545532, "num_tokens": 793906566.0, "step": 20805 }, { "epoch": 2.6467370563541532, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.84574890136719, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8596605062484741, "num_tokens": 793947814.0, "step": 20806 }, { "epoch": 2.6468642666327438, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.592437744140625, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8689914345741272, "num_tokens": 793987391.0, "step": 20807 }, { "epoch": 2.6469914769113343, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.27994918823242, "learning_rate": 1e-06, "loss": 0.5772, "mean_token_accuracy": 0.8743689656257629, "num_tokens": 794024045.0, "step": 20808 }, { "epoch": 2.647118687189925, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.685646057128906, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8679282665252686, "num_tokens": 794063405.0, "step": 20809 }, { "epoch": 2.6472458974685154, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.2049560546875, "learning_rate": 1e-06, "loss": 0.6788, "mean_token_accuracy": 0.8394476175308228, "num_tokens": 794099729.0, "step": 20810 }, { "epoch": 2.647373107747106, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.59416198730469, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.8557431697845459, "num_tokens": 794139669.0, "step": 20811 }, { "epoch": 2.6475003180256964, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.1727180480957, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8667646646499634, "num_tokens": 794176906.0, "step": 20812 }, { "epoch": 2.647627528304287, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.40959930419922, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8686292171478271, "num_tokens": 794216594.0, "step": 20813 }, { "epoch": 2.6477547385828775, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.268402099609375, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8680227994918823, "num_tokens": 794248379.0, "step": 20814 }, { "epoch": 2.647881948861468, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.13127136230469, "learning_rate": 1e-06, "loss": 0.6788, "mean_token_accuracy": 0.8397728204727173, "num_tokens": 794293604.0, "step": 20815 }, { "epoch": 2.6480091591400585, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.26988220214844, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8539040684700012, "num_tokens": 794332084.0, "step": 20816 }, { "epoch": 2.648136369418649, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.37467575073242, "learning_rate": 1e-06, "loss": 0.5588, "mean_token_accuracy": 0.879755973815918, "num_tokens": 794369158.0, "step": 20817 }, { "epoch": 2.6482635796972396, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.298744201660156, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8671820163726807, "num_tokens": 794410531.0, "step": 20818 }, { "epoch": 2.64839078997583, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.7633171081543, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8643129467964172, "num_tokens": 794457556.0, "step": 20819 }, { "epoch": 2.6485180002544206, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.416114807128906, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.8532431125640869, "num_tokens": 794492737.0, "step": 20820 }, { "epoch": 2.648645210533011, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.639408111572266, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8740381002426147, "num_tokens": 794532589.0, "step": 20821 }, { "epoch": 2.6487724208116017, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.423431396484375, "learning_rate": 1e-06, "loss": 0.6448, "mean_token_accuracy": 0.8530613780021667, "num_tokens": 794566982.0, "step": 20822 }, { "epoch": 2.648899631090192, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.5313606262207, "learning_rate": 1e-06, "loss": 0.6634, "mean_token_accuracy": 0.8475560545921326, "num_tokens": 794604278.0, "step": 20823 }, { "epoch": 2.6490268413687827, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.8307991027832, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8738154768943787, "num_tokens": 794643785.0, "step": 20824 }, { "epoch": 2.649154051647373, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.720096588134766, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8719817399978638, "num_tokens": 794678302.0, "step": 20825 }, { "epoch": 2.649281261925964, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.05514144897461, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8543911576271057, "num_tokens": 794715732.0, "step": 20826 }, { "epoch": 2.649408472204554, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.69995880126953, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8681983947753906, "num_tokens": 794754359.0, "step": 20827 }, { "epoch": 2.649535682483145, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.41468811035156, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8618716597557068, "num_tokens": 794795770.0, "step": 20828 }, { "epoch": 2.649662892761735, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.5555534362793, "learning_rate": 1e-06, "loss": 0.6424, "mean_token_accuracy": 0.8569968938827515, "num_tokens": 794835654.0, "step": 20829 }, { "epoch": 2.649790103040326, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.169288635253906, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8657211661338806, "num_tokens": 794872527.0, "step": 20830 }, { "epoch": 2.649917313318916, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.843292236328125, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8684779405593872, "num_tokens": 794918577.0, "step": 20831 }, { "epoch": 2.6500445235975065, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.94483184814453, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8744103312492371, "num_tokens": 794953566.0, "step": 20832 }, { "epoch": 2.650171733876097, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.36537551879883, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8712855577468872, "num_tokens": 794990547.0, "step": 20833 }, { "epoch": 2.6502989441546876, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.11662673950195, "learning_rate": 1e-06, "loss": 0.5522, "mean_token_accuracy": 0.879968523979187, "num_tokens": 795028419.0, "step": 20834 }, { "epoch": 2.650426154433278, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.608360290527344, "learning_rate": 1e-06, "loss": 0.5596, "mean_token_accuracy": 0.8802967071533203, "num_tokens": 795064303.0, "step": 20835 }, { "epoch": 2.6505533647118686, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.814964294433594, "learning_rate": 1e-06, "loss": 0.6652, "mean_token_accuracy": 0.840900719165802, "num_tokens": 795104280.0, "step": 20836 }, { "epoch": 2.650680574990459, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.352630615234375, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.854844331741333, "num_tokens": 795142547.0, "step": 20837 }, { "epoch": 2.6508077852690497, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.913330078125, "learning_rate": 1e-06, "loss": 0.632, "mean_token_accuracy": 0.8615251779556274, "num_tokens": 795186193.0, "step": 20838 }, { "epoch": 2.6509349955476402, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.767738342285156, "learning_rate": 1e-06, "loss": 0.5518, "mean_token_accuracy": 0.8821874260902405, "num_tokens": 795215453.0, "step": 20839 }, { "epoch": 2.6510622058262308, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.836700439453125, "learning_rate": 1e-06, "loss": 0.6259, "mean_token_accuracy": 0.860102117061615, "num_tokens": 795252820.0, "step": 20840 }, { "epoch": 2.6511894161048213, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.769405364990234, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8613171577453613, "num_tokens": 795290000.0, "step": 20841 }, { "epoch": 2.651316626383412, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.242469787597656, "learning_rate": 1e-06, "loss": 0.5529, "mean_token_accuracy": 0.8801538944244385, "num_tokens": 795330389.0, "step": 20842 }, { "epoch": 2.6514438366620023, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.74382400512695, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8613458275794983, "num_tokens": 795366693.0, "step": 20843 }, { "epoch": 2.651571046940593, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.64715576171875, "learning_rate": 1e-06, "loss": 0.5961, "mean_token_accuracy": 0.8677085638046265, "num_tokens": 795405748.0, "step": 20844 }, { "epoch": 2.6516982572191834, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.81278991699219, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8749964237213135, "num_tokens": 795446179.0, "step": 20845 }, { "epoch": 2.651825467497774, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.51285934448242, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8580935001373291, "num_tokens": 795488234.0, "step": 20846 }, { "epoch": 2.6519526777763645, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.46146774291992, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8607150316238403, "num_tokens": 795526508.0, "step": 20847 }, { "epoch": 2.652079888054955, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.13834762573242, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.872248113155365, "num_tokens": 795566299.0, "step": 20848 }, { "epoch": 2.6522070983335455, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.00589370727539, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8601903319358826, "num_tokens": 795603892.0, "step": 20849 }, { "epoch": 2.6523343086121356, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.851722717285156, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8729041814804077, "num_tokens": 795639622.0, "step": 20850 }, { "epoch": 2.6524615188907266, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.97086715698242, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.8872770071029663, "num_tokens": 795677299.0, "step": 20851 }, { "epoch": 2.6525887291693167, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.298465728759766, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8638992309570312, "num_tokens": 795714304.0, "step": 20852 }, { "epoch": 2.6527159394479076, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.20799255371094, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8597642779350281, "num_tokens": 795752330.0, "step": 20853 }, { "epoch": 2.6528431497264977, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.85960388183594, "learning_rate": 1e-06, "loss": 0.6707, "mean_token_accuracy": 0.8409976363182068, "num_tokens": 795789503.0, "step": 20854 }, { "epoch": 2.6529703600050887, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.891292572021484, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8634481430053711, "num_tokens": 795828449.0, "step": 20855 }, { "epoch": 2.6530975702836788, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.15020751953125, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8643779754638672, "num_tokens": 795868454.0, "step": 20856 }, { "epoch": 2.6532247805622693, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.021366119384766, "learning_rate": 1e-06, "loss": 0.6706, "mean_token_accuracy": 0.8499362468719482, "num_tokens": 795904629.0, "step": 20857 }, { "epoch": 2.65335199084086, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.184574127197266, "learning_rate": 1e-06, "loss": 0.651, "mean_token_accuracy": 0.8499930500984192, "num_tokens": 795943197.0, "step": 20858 }, { "epoch": 2.6534792011194503, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.47797393798828, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.860439121723175, "num_tokens": 795984672.0, "step": 20859 }, { "epoch": 2.653606411398041, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.89536666870117, "learning_rate": 1e-06, "loss": 0.6714, "mean_token_accuracy": 0.8450434803962708, "num_tokens": 796022599.0, "step": 20860 }, { "epoch": 2.6537336216766314, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 52.08897018432617, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8623360991477966, "num_tokens": 796057798.0, "step": 20861 }, { "epoch": 2.653860831955222, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.40028381347656, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.862838625907898, "num_tokens": 796094928.0, "step": 20862 }, { "epoch": 2.6539880422338125, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.89840316772461, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.863805890083313, "num_tokens": 796132614.0, "step": 20863 }, { "epoch": 2.654115252512403, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.337669372558594, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8619167804718018, "num_tokens": 796171653.0, "step": 20864 }, { "epoch": 2.6542424627909935, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.79652404785156, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.8581995964050293, "num_tokens": 796215218.0, "step": 20865 }, { "epoch": 2.654369673069584, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 52.306846618652344, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8747252225875854, "num_tokens": 796249165.0, "step": 20866 }, { "epoch": 2.6544968833481746, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.262550354003906, "learning_rate": 1e-06, "loss": 0.6252, "mean_token_accuracy": 0.8593195676803589, "num_tokens": 796283878.0, "step": 20867 }, { "epoch": 2.654624093626765, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.92027282714844, "learning_rate": 1e-06, "loss": 0.6566, "mean_token_accuracy": 0.851744532585144, "num_tokens": 796330760.0, "step": 20868 }, { "epoch": 2.6547513039053556, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.933414459228516, "learning_rate": 1e-06, "loss": 0.6612, "mean_token_accuracy": 0.8508880138397217, "num_tokens": 796371547.0, "step": 20869 }, { "epoch": 2.654878514183946, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.70528030395508, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8482300639152527, "num_tokens": 796409893.0, "step": 20870 }, { "epoch": 2.6550057244625367, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.5239372253418, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8489243984222412, "num_tokens": 796442493.0, "step": 20871 }, { "epoch": 2.655132934741127, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.84221649169922, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8640117645263672, "num_tokens": 796480853.0, "step": 20872 }, { "epoch": 2.6552601450197177, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.60454559326172, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8673767447471619, "num_tokens": 796520057.0, "step": 20873 }, { "epoch": 2.6553873552983083, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.537330627441406, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8657935261726379, "num_tokens": 796549779.0, "step": 20874 }, { "epoch": 2.6555145655768984, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.93117904663086, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8683966398239136, "num_tokens": 796590923.0, "step": 20875 }, { "epoch": 2.6556417758554893, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.99138641357422, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8596997857093811, "num_tokens": 796629455.0, "step": 20876 }, { "epoch": 2.6557689861340794, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.99101638793945, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8606687784194946, "num_tokens": 796669950.0, "step": 20877 }, { "epoch": 2.6558961964126704, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.7088508605957, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8520408868789673, "num_tokens": 796712263.0, "step": 20878 }, { "epoch": 2.6560234066912605, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.205902099609375, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8632931709289551, "num_tokens": 796753025.0, "step": 20879 }, { "epoch": 2.6561506169698514, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.14761734008789, "learning_rate": 1e-06, "loss": 0.5611, "mean_token_accuracy": 0.8756604194641113, "num_tokens": 796788810.0, "step": 20880 }, { "epoch": 2.6562778272484415, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.009029388427734, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8651562333106995, "num_tokens": 796826275.0, "step": 20881 }, { "epoch": 2.656405037527032, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.06940460205078, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.858076274394989, "num_tokens": 796859587.0, "step": 20882 }, { "epoch": 2.6565322478056226, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.07496643066406, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8689806461334229, "num_tokens": 796899440.0, "step": 20883 }, { "epoch": 2.656659458084213, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.901939392089844, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.853388786315918, "num_tokens": 796938703.0, "step": 20884 }, { "epoch": 2.6567866683628036, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.26103210449219, "learning_rate": 1e-06, "loss": 0.6618, "mean_token_accuracy": 0.8492664694786072, "num_tokens": 796982287.0, "step": 20885 }, { "epoch": 2.656913878641394, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.01227569580078, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8657671809196472, "num_tokens": 797017640.0, "step": 20886 }, { "epoch": 2.6570410889199847, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.07223892211914, "learning_rate": 1e-06, "loss": 0.6646, "mean_token_accuracy": 0.8454859256744385, "num_tokens": 797054050.0, "step": 20887 }, { "epoch": 2.6571682991985752, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.96052169799805, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8675541877746582, "num_tokens": 797092429.0, "step": 20888 }, { "epoch": 2.6572955094771658, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.896461486816406, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8599006533622742, "num_tokens": 797125646.0, "step": 20889 }, { "epoch": 2.6574227197557563, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.63959503173828, "learning_rate": 1e-06, "loss": 0.5945, "mean_token_accuracy": 0.8670468330383301, "num_tokens": 797164560.0, "step": 20890 }, { "epoch": 2.657549930034347, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.57752990722656, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8609350919723511, "num_tokens": 797194168.0, "step": 20891 }, { "epoch": 2.6576771403129373, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.19209289550781, "learning_rate": 1e-06, "loss": 0.5538, "mean_token_accuracy": 0.8835590481758118, "num_tokens": 797227045.0, "step": 20892 }, { "epoch": 2.657804350591528, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.57904815673828, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8782192468643188, "num_tokens": 797267607.0, "step": 20893 }, { "epoch": 2.6579315608701184, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.83753204345703, "learning_rate": 1e-06, "loss": 0.6464, "mean_token_accuracy": 0.8534926176071167, "num_tokens": 797309320.0, "step": 20894 }, { "epoch": 2.658058771148709, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.66128158569336, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8588956594467163, "num_tokens": 797348258.0, "step": 20895 }, { "epoch": 2.6581859814272994, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.65147018432617, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.86306232213974, "num_tokens": 797391563.0, "step": 20896 }, { "epoch": 2.65831319170589, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 52.14685821533203, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8711788058280945, "num_tokens": 797428314.0, "step": 20897 }, { "epoch": 2.6584404019844805, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.013343811035156, "learning_rate": 1e-06, "loss": 0.672, "mean_token_accuracy": 0.8426545858383179, "num_tokens": 797464480.0, "step": 20898 }, { "epoch": 2.658567612263071, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.29219055175781, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8639968633651733, "num_tokens": 797504135.0, "step": 20899 }, { "epoch": 2.658694822541661, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 50.94242858886719, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8665084838867188, "num_tokens": 797544222.0, "step": 20900 }, { "epoch": 2.658822032820252, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.57200241088867, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8693358898162842, "num_tokens": 797583695.0, "step": 20901 }, { "epoch": 2.658949243098842, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.21281814575195, "learning_rate": 1e-06, "loss": 0.6472, "mean_token_accuracy": 0.8483455181121826, "num_tokens": 797625660.0, "step": 20902 }, { "epoch": 2.659076453377433, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.082096099853516, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8612937331199646, "num_tokens": 797665807.0, "step": 20903 }, { "epoch": 2.6592036636560232, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.79177474975586, "learning_rate": 1e-06, "loss": 0.6688, "mean_token_accuracy": 0.8438201546669006, "num_tokens": 797707935.0, "step": 20904 }, { "epoch": 2.6593308739346138, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.19560241699219, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8554229140281677, "num_tokens": 797745502.0, "step": 20905 }, { "epoch": 2.6594580842132043, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.26875305175781, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8627985715866089, "num_tokens": 797779412.0, "step": 20906 }, { "epoch": 2.659585294491795, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.05320358276367, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8685610294342041, "num_tokens": 797816677.0, "step": 20907 }, { "epoch": 2.6597125047703853, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 52.0783576965332, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8447482585906982, "num_tokens": 797854260.0, "step": 20908 }, { "epoch": 2.659839715048976, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.11897659301758, "learning_rate": 1e-06, "loss": 0.6598, "mean_token_accuracy": 0.8509191870689392, "num_tokens": 797892349.0, "step": 20909 }, { "epoch": 2.6599669253275664, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.7939567565918, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8665099740028381, "num_tokens": 797928028.0, "step": 20910 }, { "epoch": 2.660094135606157, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 50.98086166381836, "learning_rate": 1e-06, "loss": 0.6572, "mean_token_accuracy": 0.8482579588890076, "num_tokens": 797963885.0, "step": 20911 }, { "epoch": 2.6602213458847475, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.39823532104492, "learning_rate": 1e-06, "loss": 0.6545, "mean_token_accuracy": 0.8481297492980957, "num_tokens": 798009842.0, "step": 20912 }, { "epoch": 2.660348556163338, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 50.74726104736328, "learning_rate": 1e-06, "loss": 0.5629, "mean_token_accuracy": 0.8744342923164368, "num_tokens": 798043207.0, "step": 20913 }, { "epoch": 2.6604757664419285, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.4144287109375, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8769030570983887, "num_tokens": 798080887.0, "step": 20914 }, { "epoch": 2.660602976720519, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.56547927856445, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.872532844543457, "num_tokens": 798126393.0, "step": 20915 }, { "epoch": 2.6607301869991096, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.79587936401367, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8789372444152832, "num_tokens": 798166099.0, "step": 20916 }, { "epoch": 2.6608573972777, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.07318115234375, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8571330904960632, "num_tokens": 798202285.0, "step": 20917 }, { "epoch": 2.6609846075562906, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.9927978515625, "learning_rate": 1e-06, "loss": 0.5874, "mean_token_accuracy": 0.8717019557952881, "num_tokens": 798243463.0, "step": 20918 }, { "epoch": 2.661111817834881, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.333255767822266, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8651846051216125, "num_tokens": 798275507.0, "step": 20919 }, { "epoch": 2.6612390281134717, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.642398834228516, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8820062875747681, "num_tokens": 798310536.0, "step": 20920 }, { "epoch": 2.661366238392062, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.99489212036133, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.858479917049408, "num_tokens": 798347874.0, "step": 20921 }, { "epoch": 2.6614934486706527, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.95277404785156, "learning_rate": 1e-06, "loss": 0.5384, "mean_token_accuracy": 0.8877485990524292, "num_tokens": 798381865.0, "step": 20922 }, { "epoch": 2.661620658949243, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.205848693847656, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.861034631729126, "num_tokens": 798423378.0, "step": 20923 }, { "epoch": 2.661747869227834, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.39943313598633, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.869925856590271, "num_tokens": 798460278.0, "step": 20924 }, { "epoch": 2.661875079506424, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.77231216430664, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8635598421096802, "num_tokens": 798496622.0, "step": 20925 }, { "epoch": 2.662002289785015, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.26600646972656, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8575832843780518, "num_tokens": 798533061.0, "step": 20926 }, { "epoch": 2.662129500063605, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.29761505126953, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8565493822097778, "num_tokens": 798573689.0, "step": 20927 }, { "epoch": 2.662256710342196, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.93189239501953, "learning_rate": 1e-06, "loss": 0.5773, "mean_token_accuracy": 0.8726423382759094, "num_tokens": 798614058.0, "step": 20928 }, { "epoch": 2.662383920620786, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.93684005737305, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8535668253898621, "num_tokens": 798657769.0, "step": 20929 }, { "epoch": 2.6625111308993765, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.85930252075195, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8660916686058044, "num_tokens": 798701084.0, "step": 20930 }, { "epoch": 2.662638341177967, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.11481475830078, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8674854636192322, "num_tokens": 798738029.0, "step": 20931 }, { "epoch": 2.6627655514565576, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.05023956298828, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8588172197341919, "num_tokens": 798773537.0, "step": 20932 }, { "epoch": 2.662892761735148, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.18379592895508, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8611513376235962, "num_tokens": 798810397.0, "step": 20933 }, { "epoch": 2.6630199720137386, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.92952346801758, "learning_rate": 1e-06, "loss": 0.6287, "mean_token_accuracy": 0.8598378896713257, "num_tokens": 798850796.0, "step": 20934 }, { "epoch": 2.663147182292329, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.437496185302734, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8637698888778687, "num_tokens": 798892379.0, "step": 20935 }, { "epoch": 2.6632743925709197, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.56793975830078, "learning_rate": 1e-06, "loss": 0.6498, "mean_token_accuracy": 0.8506150245666504, "num_tokens": 798929444.0, "step": 20936 }, { "epoch": 2.66340160284951, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.03126525878906, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8691335916519165, "num_tokens": 798964086.0, "step": 20937 }, { "epoch": 2.6635288131281007, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.10740280151367, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8709021806716919, "num_tokens": 799007408.0, "step": 20938 }, { "epoch": 2.6636560234066913, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.29359436035156, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8668584823608398, "num_tokens": 799045930.0, "step": 20939 }, { "epoch": 2.663783233685282, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.338836669921875, "learning_rate": 1e-06, "loss": 0.5803, "mean_token_accuracy": 0.8742703199386597, "num_tokens": 799083568.0, "step": 20940 }, { "epoch": 2.6639104439638723, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.97447967529297, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8592286109924316, "num_tokens": 799123681.0, "step": 20941 }, { "epoch": 2.664037654242463, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.103633880615234, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.858805775642395, "num_tokens": 799164623.0, "step": 20942 }, { "epoch": 2.6641648645210534, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.05434036254883, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8623015284538269, "num_tokens": 799200408.0, "step": 20943 }, { "epoch": 2.664292074799644, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.50286865234375, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8523584008216858, "num_tokens": 799245106.0, "step": 20944 }, { "epoch": 2.6644192850782344, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.29991912841797, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8700628280639648, "num_tokens": 799290429.0, "step": 20945 }, { "epoch": 2.664546495356825, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.49336624145508, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.860685408115387, "num_tokens": 799322951.0, "step": 20946 }, { "epoch": 2.6646737056354155, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.155738830566406, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8600404858589172, "num_tokens": 799365498.0, "step": 20947 }, { "epoch": 2.6648009159140056, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.61344528198242, "learning_rate": 1e-06, "loss": 0.633, "mean_token_accuracy": 0.8568419814109802, "num_tokens": 799405894.0, "step": 20948 }, { "epoch": 2.6649281261925966, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.88772201538086, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8613401055335999, "num_tokens": 799439750.0, "step": 20949 }, { "epoch": 2.6650553364711866, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.546756744384766, "learning_rate": 1e-06, "loss": 0.6434, "mean_token_accuracy": 0.8554650545120239, "num_tokens": 799475170.0, "step": 20950 }, { "epoch": 2.6651825467497776, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.57762145996094, "learning_rate": 1e-06, "loss": 0.6639, "mean_token_accuracy": 0.8494559526443481, "num_tokens": 799514231.0, "step": 20951 }, { "epoch": 2.6653097570283677, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.91427993774414, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8630378842353821, "num_tokens": 799549512.0, "step": 20952 }, { "epoch": 2.6654369673069587, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.02342987060547, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8572788238525391, "num_tokens": 799592144.0, "step": 20953 }, { "epoch": 2.6655641775855488, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.93546676635742, "learning_rate": 1e-06, "loss": 0.662, "mean_token_accuracy": 0.8476094007492065, "num_tokens": 799634779.0, "step": 20954 }, { "epoch": 2.6656913878641393, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.31308364868164, "learning_rate": 1e-06, "loss": 0.5723, "mean_token_accuracy": 0.8750079870223999, "num_tokens": 799677548.0, "step": 20955 }, { "epoch": 2.66581859814273, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.92882537841797, "learning_rate": 1e-06, "loss": 0.6768, "mean_token_accuracy": 0.84473717212677, "num_tokens": 799715679.0, "step": 20956 }, { "epoch": 2.6659458084213203, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.24887466430664, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8560164570808411, "num_tokens": 799755701.0, "step": 20957 }, { "epoch": 2.666073018699911, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.11696243286133, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8588628172874451, "num_tokens": 799795151.0, "step": 20958 }, { "epoch": 2.6662002289785014, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.159576416015625, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8670739531517029, "num_tokens": 799832249.0, "step": 20959 }, { "epoch": 2.666327439257092, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.57289505004883, "learning_rate": 1e-06, "loss": 0.6704, "mean_token_accuracy": 0.8393434286117554, "num_tokens": 799867762.0, "step": 20960 }, { "epoch": 2.6664546495356825, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.86969757080078, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8594501614570618, "num_tokens": 799909012.0, "step": 20961 }, { "epoch": 2.666581859814273, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.06875228881836, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.858237624168396, "num_tokens": 799950316.0, "step": 20962 }, { "epoch": 2.6667090700928635, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.12102508544922, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8769084215164185, "num_tokens": 799990037.0, "step": 20963 }, { "epoch": 2.666836280371454, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.69139862060547, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8724328875541687, "num_tokens": 800023935.0, "step": 20964 }, { "epoch": 2.6669634906500446, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.05101013183594, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8687450289726257, "num_tokens": 800066357.0, "step": 20965 }, { "epoch": 2.667090700928635, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.629791259765625, "learning_rate": 1e-06, "loss": 0.6624, "mean_token_accuracy": 0.8479999303817749, "num_tokens": 800100667.0, "step": 20966 }, { "epoch": 2.6672179112072256, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.641204833984375, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8682348728179932, "num_tokens": 800135101.0, "step": 20967 }, { "epoch": 2.667345121485816, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.15675735473633, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8783407211303711, "num_tokens": 800176308.0, "step": 20968 }, { "epoch": 2.6674723317644067, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.861331939697266, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.873773455619812, "num_tokens": 800205585.0, "step": 20969 }, { "epoch": 2.667599542042997, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.48551559448242, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8608644008636475, "num_tokens": 800241400.0, "step": 20970 }, { "epoch": 2.6677267523215877, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.1325798034668, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8703969120979309, "num_tokens": 800279060.0, "step": 20971 }, { "epoch": 2.6678539626001783, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.59739685058594, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8714140057563782, "num_tokens": 800318606.0, "step": 20972 }, { "epoch": 2.6679811728787683, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.9775276184082, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8626394271850586, "num_tokens": 800356795.0, "step": 20973 }, { "epoch": 2.6681083831573593, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.381996154785156, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.8717671632766724, "num_tokens": 800393786.0, "step": 20974 }, { "epoch": 2.6682355934359494, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.725616455078125, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8609788417816162, "num_tokens": 800440153.0, "step": 20975 }, { "epoch": 2.6683628037145404, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.4769172668457, "learning_rate": 1e-06, "loss": 0.6727, "mean_token_accuracy": 0.8445336818695068, "num_tokens": 800476343.0, "step": 20976 }, { "epoch": 2.6684900139931305, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.8653564453125, "learning_rate": 1e-06, "loss": 0.5719, "mean_token_accuracy": 0.8770239949226379, "num_tokens": 800514227.0, "step": 20977 }, { "epoch": 2.668617224271721, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.37262725830078, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8616201877593994, "num_tokens": 800552357.0, "step": 20978 }, { "epoch": 2.6687444345503115, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.06850051879883, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8723474740982056, "num_tokens": 800587624.0, "step": 20979 }, { "epoch": 2.668871644828902, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.94001770019531, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8568456768989563, "num_tokens": 800631183.0, "step": 20980 }, { "epoch": 2.6689988551074926, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.071510314941406, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8654327988624573, "num_tokens": 800670334.0, "step": 20981 }, { "epoch": 2.669126065386083, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.745235443115234, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8583618402481079, "num_tokens": 800703500.0, "step": 20982 }, { "epoch": 2.6692532756646736, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.656166076660156, "learning_rate": 1e-06, "loss": 0.6388, "mean_token_accuracy": 0.8554359078407288, "num_tokens": 800742904.0, "step": 20983 }, { "epoch": 2.669380485943264, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.745914459228516, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8601899743080139, "num_tokens": 800777556.0, "step": 20984 }, { "epoch": 2.6695076962218547, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.67478561401367, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8536509871482849, "num_tokens": 800817532.0, "step": 20985 }, { "epoch": 2.669634906500445, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 52.04814529418945, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8731154799461365, "num_tokens": 800856567.0, "step": 20986 }, { "epoch": 2.6697621167790357, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.564022064208984, "learning_rate": 1e-06, "loss": 0.6794, "mean_token_accuracy": 0.8400650024414062, "num_tokens": 800901234.0, "step": 20987 }, { "epoch": 2.6698893270576263, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.15679931640625, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8592966794967651, "num_tokens": 800942647.0, "step": 20988 }, { "epoch": 2.670016537336217, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.699153900146484, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8715484142303467, "num_tokens": 800980194.0, "step": 20989 }, { "epoch": 2.6701437476148073, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.35129165649414, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8680466413497925, "num_tokens": 801016978.0, "step": 20990 }, { "epoch": 2.670270957893398, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.881290435791016, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.8483009338378906, "num_tokens": 801051921.0, "step": 20991 }, { "epoch": 2.6703981681719884, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.50558090209961, "learning_rate": 1e-06, "loss": 0.6806, "mean_token_accuracy": 0.8357473611831665, "num_tokens": 801087189.0, "step": 20992 }, { "epoch": 2.670525378450579, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.793495178222656, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8688664436340332, "num_tokens": 801121574.0, "step": 20993 }, { "epoch": 2.6706525887291694, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.348655700683594, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8680594563484192, "num_tokens": 801160497.0, "step": 20994 }, { "epoch": 2.67077979900776, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.88588333129883, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8645626306533813, "num_tokens": 801198556.0, "step": 20995 }, { "epoch": 2.6709070092863505, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.573726654052734, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.889593243598938, "num_tokens": 801230616.0, "step": 20996 }, { "epoch": 2.671034219564941, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.971412658691406, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8616454601287842, "num_tokens": 801266582.0, "step": 20997 }, { "epoch": 2.671161429843531, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.97721862792969, "learning_rate": 1e-06, "loss": 0.5789, "mean_token_accuracy": 0.874793291091919, "num_tokens": 801305127.0, "step": 20998 }, { "epoch": 2.671288640122122, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.97140884399414, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8671008348464966, "num_tokens": 801348284.0, "step": 20999 }, { "epoch": 2.671415850400712, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.135074615478516, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8831453919410706, "num_tokens": 801385298.0, "step": 21000 }, { "epoch": 2.671543060679303, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.384971618652344, "learning_rate": 1e-06, "loss": 0.6548, "mean_token_accuracy": 0.855400800704956, "num_tokens": 801417020.0, "step": 21001 }, { "epoch": 2.6716702709578932, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 50.452457427978516, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8578375577926636, "num_tokens": 801455328.0, "step": 21002 }, { "epoch": 2.6717974812364838, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.86540603637695, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8727941513061523, "num_tokens": 801494342.0, "step": 21003 }, { "epoch": 2.6719246915150743, "ewc_loss": 0.203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000179290771484375, "grad_norm": 51.128570556640625, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8655363321304321, "num_tokens": 801528182.0, "step": 21004 }, { "epoch": 2.672051901793665, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.6765251159668, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.861337423324585, "num_tokens": 801569424.0, "step": 21005 }, { "epoch": 2.6721791120722553, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.26832580566406, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8614312410354614, "num_tokens": 801608809.0, "step": 21006 }, { "epoch": 2.672306322350846, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.03571701049805, "learning_rate": 1e-06, "loss": 0.6475, "mean_token_accuracy": 0.8545417785644531, "num_tokens": 801651910.0, "step": 21007 }, { "epoch": 2.6724335326294364, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.828250885009766, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8633133172988892, "num_tokens": 801692961.0, "step": 21008 }, { "epoch": 2.672560742908027, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.25647735595703, "learning_rate": 1e-06, "loss": 0.6636, "mean_token_accuracy": 0.8501272201538086, "num_tokens": 801731278.0, "step": 21009 }, { "epoch": 2.6726879531866174, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.74991989135742, "learning_rate": 1e-06, "loss": 0.6287, "mean_token_accuracy": 0.8582817912101746, "num_tokens": 801769562.0, "step": 21010 }, { "epoch": 2.672815163465208, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.15770721435547, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8598062992095947, "num_tokens": 801811046.0, "step": 21011 }, { "epoch": 2.6729423737437985, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.25973892211914, "learning_rate": 1e-06, "loss": 0.679, "mean_token_accuracy": 0.8389635682106018, "num_tokens": 801847833.0, "step": 21012 }, { "epoch": 2.673069584022389, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.30104064941406, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8571205735206604, "num_tokens": 801887819.0, "step": 21013 }, { "epoch": 2.6731967943009796, "ewc_loss": 0.2041015625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018024444580078125, "grad_norm": 51.45951461791992, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8755844831466675, "num_tokens": 801922535.0, "step": 21014 }, { "epoch": 2.67332400457957, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.475460052490234, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8669937252998352, "num_tokens": 801958025.0, "step": 21015 }, { "epoch": 2.6734512148581606, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.28896713256836, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8654358983039856, "num_tokens": 801991291.0, "step": 21016 }, { "epoch": 2.673578425136751, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.081687927246094, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8648470640182495, "num_tokens": 802024952.0, "step": 21017 }, { "epoch": 2.6737056354153417, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.79314041137695, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8607593178749084, "num_tokens": 802065741.0, "step": 21018 }, { "epoch": 2.673832845693932, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.302040100097656, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8614011406898499, "num_tokens": 802107153.0, "step": 21019 }, { "epoch": 2.6739600559725227, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.99766540527344, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8665977716445923, "num_tokens": 802148177.0, "step": 21020 }, { "epoch": 2.674087266251113, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.81307601928711, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8639577627182007, "num_tokens": 802186144.0, "step": 21021 }, { "epoch": 2.674214476529704, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.86933135986328, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8777803182601929, "num_tokens": 802224054.0, "step": 21022 }, { "epoch": 2.674341686808294, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.267765045166016, "learning_rate": 1e-06, "loss": 0.6693, "mean_token_accuracy": 0.8471936583518982, "num_tokens": 802264483.0, "step": 21023 }, { "epoch": 2.674468897086885, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.124183654785156, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.870703935623169, "num_tokens": 802302933.0, "step": 21024 }, { "epoch": 2.674596107365475, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.41911315917969, "learning_rate": 1e-06, "loss": 0.6408, "mean_token_accuracy": 0.8561028838157654, "num_tokens": 802347354.0, "step": 21025 }, { "epoch": 2.674723317644066, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.25794982910156, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8730683326721191, "num_tokens": 802392159.0, "step": 21026 }, { "epoch": 2.674850527922656, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.11211013793945, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.8658638000488281, "num_tokens": 802432629.0, "step": 21027 }, { "epoch": 2.6749777382012465, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.80073165893555, "learning_rate": 1e-06, "loss": 0.5822, "mean_token_accuracy": 0.8729010820388794, "num_tokens": 802471776.0, "step": 21028 }, { "epoch": 2.675104948479837, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.14702606201172, "learning_rate": 1e-06, "loss": 0.6362, "mean_token_accuracy": 0.8561077117919922, "num_tokens": 802507314.0, "step": 21029 }, { "epoch": 2.6752321587584276, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.68477249145508, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8715376257896423, "num_tokens": 802550587.0, "step": 21030 }, { "epoch": 2.675359369037018, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.338775634765625, "learning_rate": 1e-06, "loss": 0.6262, "mean_token_accuracy": 0.8630246520042419, "num_tokens": 802591544.0, "step": 21031 }, { "epoch": 2.6754865793156086, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.742862701416016, "learning_rate": 1e-06, "loss": 0.5758, "mean_token_accuracy": 0.8711875677108765, "num_tokens": 802627336.0, "step": 21032 }, { "epoch": 2.675613789594199, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.508399963378906, "learning_rate": 1e-06, "loss": 0.6273, "mean_token_accuracy": 0.8629356622695923, "num_tokens": 802668358.0, "step": 21033 }, { "epoch": 2.6757409998727897, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.0460090637207, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.871872067451477, "num_tokens": 802708023.0, "step": 21034 }, { "epoch": 2.67586821015138, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.161380767822266, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8845593929290771, "num_tokens": 802745856.0, "step": 21035 }, { "epoch": 2.6759954204299707, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.973777770996094, "learning_rate": 1e-06, "loss": 0.582, "mean_token_accuracy": 0.8744706511497498, "num_tokens": 802779032.0, "step": 21036 }, { "epoch": 2.6761226307085613, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.1067008972168, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8635830879211426, "num_tokens": 802815736.0, "step": 21037 }, { "epoch": 2.676249840987152, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.53870391845703, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8593578338623047, "num_tokens": 802854133.0, "step": 21038 }, { "epoch": 2.6763770512657423, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.60659408569336, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.870335042476654, "num_tokens": 802892969.0, "step": 21039 }, { "epoch": 2.676504261544333, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.350791931152344, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8788168430328369, "num_tokens": 802936998.0, "step": 21040 }, { "epoch": 2.6766314718229234, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.61827850341797, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8722886443138123, "num_tokens": 802979040.0, "step": 21041 }, { "epoch": 2.676758682101514, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.286258697509766, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8723870515823364, "num_tokens": 803016516.0, "step": 21042 }, { "epoch": 2.6768858923801044, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.14915466308594, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8610521554946899, "num_tokens": 803055566.0, "step": 21043 }, { "epoch": 2.677013102658695, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.281131744384766, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8654629588127136, "num_tokens": 803093683.0, "step": 21044 }, { "epoch": 2.6771403129372855, "ewc_loss": 0.21875, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.27946090698242, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.871649980545044, "num_tokens": 803133691.0, "step": 21045 }, { "epoch": 2.6772675232158756, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.828407287597656, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8723708987236023, "num_tokens": 803169870.0, "step": 21046 }, { "epoch": 2.6773947334944665, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.39030075073242, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8567672371864319, "num_tokens": 803209038.0, "step": 21047 }, { "epoch": 2.6775219437730566, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.81227493286133, "learning_rate": 1e-06, "loss": 0.6402, "mean_token_accuracy": 0.849961519241333, "num_tokens": 803239725.0, "step": 21048 }, { "epoch": 2.6776491540516476, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.83961868286133, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8632615804672241, "num_tokens": 803273580.0, "step": 21049 }, { "epoch": 2.6777763643302377, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.35777282714844, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8702811002731323, "num_tokens": 803313583.0, "step": 21050 }, { "epoch": 2.6779035746088287, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.97842788696289, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.870695948600769, "num_tokens": 803347517.0, "step": 21051 }, { "epoch": 2.6780307848874187, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.32585144042969, "learning_rate": 1e-06, "loss": 0.6498, "mean_token_accuracy": 0.8503818511962891, "num_tokens": 803388183.0, "step": 21052 }, { "epoch": 2.6781579951660093, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.16926574707031, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8649657964706421, "num_tokens": 803428377.0, "step": 21053 }, { "epoch": 2.6782852054446, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.12466049194336, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8737167119979858, "num_tokens": 803460930.0, "step": 21054 }, { "epoch": 2.6784124157231903, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.54660415649414, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8723649978637695, "num_tokens": 803497971.0, "step": 21055 }, { "epoch": 2.678539626001781, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.30948257446289, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.8795313239097595, "num_tokens": 803535514.0, "step": 21056 }, { "epoch": 2.6786668362803714, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.577266693115234, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8580923080444336, "num_tokens": 803580767.0, "step": 21057 }, { "epoch": 2.678794046558962, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.58681106567383, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8655673265457153, "num_tokens": 803619168.0, "step": 21058 }, { "epoch": 2.6789212568375524, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.381591796875, "learning_rate": 1e-06, "loss": 0.6486, "mean_token_accuracy": 0.8485723733901978, "num_tokens": 803654785.0, "step": 21059 }, { "epoch": 2.679048467116143, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.27217483520508, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.873450756072998, "num_tokens": 803692505.0, "step": 21060 }, { "epoch": 2.6791756773947335, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.82695770263672, "learning_rate": 1e-06, "loss": 0.5787, "mean_token_accuracy": 0.871077299118042, "num_tokens": 803732592.0, "step": 21061 }, { "epoch": 2.679302887673324, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.3476676940918, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8521853685379028, "num_tokens": 803767601.0, "step": 21062 }, { "epoch": 2.6794300979519146, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.842227935791016, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.8560354709625244, "num_tokens": 803804329.0, "step": 21063 }, { "epoch": 2.679557308230505, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.3867073059082, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.858736515045166, "num_tokens": 803846182.0, "step": 21064 }, { "epoch": 2.6796845185090956, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.5511589050293, "learning_rate": 1e-06, "loss": 0.6366, "mean_token_accuracy": 0.859512448310852, "num_tokens": 803883561.0, "step": 21065 }, { "epoch": 2.679811728787686, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.16688537597656, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.883691668510437, "num_tokens": 803916860.0, "step": 21066 }, { "epoch": 2.6799389390662767, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.88066101074219, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8629463911056519, "num_tokens": 803951949.0, "step": 21067 }, { "epoch": 2.680066149344867, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.92117691040039, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.857811689376831, "num_tokens": 803992094.0, "step": 21068 }, { "epoch": 2.6801933596234577, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.61558151245117, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8625702261924744, "num_tokens": 804034058.0, "step": 21069 }, { "epoch": 2.6803205699020483, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.703304290771484, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8548895120620728, "num_tokens": 804072722.0, "step": 21070 }, { "epoch": 2.6804477801806383, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.13787841796875, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8706163167953491, "num_tokens": 804114253.0, "step": 21071 }, { "epoch": 2.6805749904592293, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.94392013549805, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8668864369392395, "num_tokens": 804154877.0, "step": 21072 }, { "epoch": 2.6807022007378194, "ewc_loss": 0.205078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001811981201171875, "grad_norm": 51.19211959838867, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8578628897666931, "num_tokens": 804191060.0, "step": 21073 }, { "epoch": 2.6808294110164104, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.69684600830078, "learning_rate": 1e-06, "loss": 0.6353, "mean_token_accuracy": 0.8552697896957397, "num_tokens": 804231574.0, "step": 21074 }, { "epoch": 2.6809566212950005, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.99627685546875, "learning_rate": 1e-06, "loss": 0.6685, "mean_token_accuracy": 0.84214186668396, "num_tokens": 804264027.0, "step": 21075 }, { "epoch": 2.681083831573591, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.70026397705078, "learning_rate": 1e-06, "loss": 0.6097, "mean_token_accuracy": 0.8656505942344666, "num_tokens": 804302906.0, "step": 21076 }, { "epoch": 2.6812110418521815, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.228641510009766, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8584423661231995, "num_tokens": 804339107.0, "step": 21077 }, { "epoch": 2.681338252130772, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.689456939697266, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8670620918273926, "num_tokens": 804374292.0, "step": 21078 }, { "epoch": 2.6814654624093626, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.74850082397461, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8668795824050903, "num_tokens": 804420287.0, "step": 21079 }, { "epoch": 2.681592672687953, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.85415267944336, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8699292540550232, "num_tokens": 804453977.0, "step": 21080 }, { "epoch": 2.6817198829665436, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.72795867919922, "learning_rate": 1e-06, "loss": 0.6733, "mean_token_accuracy": 0.8484228849411011, "num_tokens": 804494958.0, "step": 21081 }, { "epoch": 2.681847093245134, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.49839782714844, "learning_rate": 1e-06, "loss": 0.6659, "mean_token_accuracy": 0.8435177803039551, "num_tokens": 804532209.0, "step": 21082 }, { "epoch": 2.6819743035237247, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.52189254760742, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8662348389625549, "num_tokens": 804572937.0, "step": 21083 }, { "epoch": 2.682101513802315, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.44469451904297, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8635303378105164, "num_tokens": 804610263.0, "step": 21084 }, { "epoch": 2.6822287240809057, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.78089141845703, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8675234317779541, "num_tokens": 804645516.0, "step": 21085 }, { "epoch": 2.6823559343594963, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.485145568847656, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.8754464387893677, "num_tokens": 804684540.0, "step": 21086 }, { "epoch": 2.682483144638087, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.60413360595703, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.8649747371673584, "num_tokens": 804723030.0, "step": 21087 }, { "epoch": 2.6826103549166773, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.36819839477539, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8717681169509888, "num_tokens": 804759202.0, "step": 21088 }, { "epoch": 2.682737565195268, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.096256256103516, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8613677024841309, "num_tokens": 804803199.0, "step": 21089 }, { "epoch": 2.6828647754738584, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.65864562988281, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8613660335540771, "num_tokens": 804835675.0, "step": 21090 }, { "epoch": 2.682991985752449, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.0130615234375, "learning_rate": 1e-06, "loss": 0.6566, "mean_token_accuracy": 0.8501385450363159, "num_tokens": 804876583.0, "step": 21091 }, { "epoch": 2.6831191960310394, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.77650833129883, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8538159728050232, "num_tokens": 804906110.0, "step": 21092 }, { "epoch": 2.68324640630963, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.78594207763672, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.855080783367157, "num_tokens": 804942196.0, "step": 21093 }, { "epoch": 2.6833736165882205, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 50.93603515625, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8627740144729614, "num_tokens": 804978760.0, "step": 21094 }, { "epoch": 2.683500826866811, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.640594482421875, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.8545989990234375, "num_tokens": 805014212.0, "step": 21095 }, { "epoch": 2.683628037145401, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.88797378540039, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.886512041091919, "num_tokens": 805050528.0, "step": 21096 }, { "epoch": 2.683755247423992, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.57014846801758, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8665383458137512, "num_tokens": 805087355.0, "step": 21097 }, { "epoch": 2.683882457702582, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.063961029052734, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.8510533571243286, "num_tokens": 805117837.0, "step": 21098 }, { "epoch": 2.684009667981173, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.742008209228516, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8626230955123901, "num_tokens": 805157734.0, "step": 21099 }, { "epoch": 2.684136878259763, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.33946228027344, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8703790903091431, "num_tokens": 805190657.0, "step": 21100 }, { "epoch": 2.6842640885383537, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.291748046875, "learning_rate": 1e-06, "loss": 0.6499, "mean_token_accuracy": 0.8537575006484985, "num_tokens": 805229654.0, "step": 21101 }, { "epoch": 2.6843912988169443, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.02273941040039, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8631012439727783, "num_tokens": 805269199.0, "step": 21102 }, { "epoch": 2.684518509095535, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.5150260925293, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8573522567749023, "num_tokens": 805308918.0, "step": 21103 }, { "epoch": 2.6846457193741253, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.9123649597168, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.8598335385322571, "num_tokens": 805345014.0, "step": 21104 }, { "epoch": 2.684772929652716, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.323909759521484, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.8759326338768005, "num_tokens": 805383674.0, "step": 21105 }, { "epoch": 2.6849001399313064, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.14491653442383, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8663239479064941, "num_tokens": 805419765.0, "step": 21106 }, { "epoch": 2.685027350209897, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.87880325317383, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8643527626991272, "num_tokens": 805461346.0, "step": 21107 }, { "epoch": 2.6851545604884874, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.703712463378906, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8711429834365845, "num_tokens": 805494519.0, "step": 21108 }, { "epoch": 2.685281770767078, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.927921295166016, "learning_rate": 1e-06, "loss": 0.6556, "mean_token_accuracy": 0.8521559238433838, "num_tokens": 805530542.0, "step": 21109 }, { "epoch": 2.6854089810456685, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.27162170410156, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8597840070724487, "num_tokens": 805564531.0, "step": 21110 }, { "epoch": 2.685536191324259, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.06678009033203, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.859887957572937, "num_tokens": 805606011.0, "step": 21111 }, { "epoch": 2.6856634016028496, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.666927337646484, "learning_rate": 1e-06, "loss": 0.6383, "mean_token_accuracy": 0.8581324219703674, "num_tokens": 805643078.0, "step": 21112 }, { "epoch": 2.68579061188144, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.4647331237793, "learning_rate": 1e-06, "loss": 0.5908, "mean_token_accuracy": 0.8671606779098511, "num_tokens": 805678554.0, "step": 21113 }, { "epoch": 2.6859178221600306, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.437808990478516, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8738569021224976, "num_tokens": 805713736.0, "step": 21114 }, { "epoch": 2.686045032438621, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.0180778503418, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.857181191444397, "num_tokens": 805751798.0, "step": 21115 }, { "epoch": 2.6861722427172117, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.32985305786133, "learning_rate": 1e-06, "loss": 0.5896, "mean_token_accuracy": 0.8718253374099731, "num_tokens": 805789205.0, "step": 21116 }, { "epoch": 2.686299452995802, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.46779251098633, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8674532771110535, "num_tokens": 805824910.0, "step": 21117 }, { "epoch": 2.6864266632743927, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.2475471496582, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8583065271377563, "num_tokens": 805861335.0, "step": 21118 }, { "epoch": 2.686553873552983, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.37767028808594, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8599700927734375, "num_tokens": 805901488.0, "step": 21119 }, { "epoch": 2.686681083831574, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.028076171875, "learning_rate": 1e-06, "loss": 0.7276, "mean_token_accuracy": 0.8251690864562988, "num_tokens": 805941336.0, "step": 21120 }, { "epoch": 2.686808294110164, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.710628509521484, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.870564341545105, "num_tokens": 805974222.0, "step": 21121 }, { "epoch": 2.686935504388755, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 51.58815002441406, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8586572408676147, "num_tokens": 806014942.0, "step": 21122 }, { "epoch": 2.687062714667345, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.79922866821289, "learning_rate": 1e-06, "loss": 0.5724, "mean_token_accuracy": 0.8740367889404297, "num_tokens": 806048894.0, "step": 21123 }, { "epoch": 2.687189924945936, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.04499053955078, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8609948754310608, "num_tokens": 806089714.0, "step": 21124 }, { "epoch": 2.687317135224526, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.06049728393555, "learning_rate": 1e-06, "loss": 0.6476, "mean_token_accuracy": 0.8511907458305359, "num_tokens": 806128974.0, "step": 21125 }, { "epoch": 2.6874443455031165, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.484989166259766, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8606927394866943, "num_tokens": 806169857.0, "step": 21126 }, { "epoch": 2.687571555781707, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.618133544921875, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8654371500015259, "num_tokens": 806202826.0, "step": 21127 }, { "epoch": 2.6876987660602976, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.92564010620117, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8623301982879639, "num_tokens": 806242850.0, "step": 21128 }, { "epoch": 2.687825976338888, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.72551345825195, "learning_rate": 1e-06, "loss": 0.5957, "mean_token_accuracy": 0.8664370775222778, "num_tokens": 806279024.0, "step": 21129 }, { "epoch": 2.6879531866174786, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.15110397338867, "learning_rate": 1e-06, "loss": 0.6553, "mean_token_accuracy": 0.8473142981529236, "num_tokens": 806313669.0, "step": 21130 }, { "epoch": 2.688080396896069, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.65166473388672, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8659923076629639, "num_tokens": 806353128.0, "step": 21131 }, { "epoch": 2.6882076071746597, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.70367431640625, "learning_rate": 1e-06, "loss": 0.6036, "mean_token_accuracy": 0.8639742136001587, "num_tokens": 806385044.0, "step": 21132 }, { "epoch": 2.68833481745325, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.90378189086914, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8536531925201416, "num_tokens": 806422157.0, "step": 21133 }, { "epoch": 2.6884620277318407, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.03413772583008, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8638559579849243, "num_tokens": 806463180.0, "step": 21134 }, { "epoch": 2.6885892380104313, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.00732421875, "learning_rate": 1e-06, "loss": 0.6264, "mean_token_accuracy": 0.8582977652549744, "num_tokens": 806502031.0, "step": 21135 }, { "epoch": 2.688716448289022, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.67475509643555, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8647084832191467, "num_tokens": 806539585.0, "step": 21136 }, { "epoch": 2.6888436585676123, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.05707550048828, "learning_rate": 1e-06, "loss": 0.5563, "mean_token_accuracy": 0.8815871477127075, "num_tokens": 806573147.0, "step": 21137 }, { "epoch": 2.688970868846203, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.747901916503906, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.874854326248169, "num_tokens": 806609785.0, "step": 21138 }, { "epoch": 2.6890980791247934, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.078128814697266, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8687173128128052, "num_tokens": 806647709.0, "step": 21139 }, { "epoch": 2.689225289403384, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.06113815307617, "learning_rate": 1e-06, "loss": 0.5916, "mean_token_accuracy": 0.8677330017089844, "num_tokens": 806684448.0, "step": 21140 }, { "epoch": 2.6893524996819744, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.88745880126953, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8671543598175049, "num_tokens": 806725326.0, "step": 21141 }, { "epoch": 2.689479709960565, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.94504928588867, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8634509444236755, "num_tokens": 806759797.0, "step": 21142 }, { "epoch": 2.6896069202391555, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.21786880493164, "learning_rate": 1e-06, "loss": 0.5913, "mean_token_accuracy": 0.8705475330352783, "num_tokens": 806799986.0, "step": 21143 }, { "epoch": 2.6897341305177456, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.24087142944336, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8594175577163696, "num_tokens": 806843055.0, "step": 21144 }, { "epoch": 2.6898613407963365, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.90578079223633, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8779338598251343, "num_tokens": 806875897.0, "step": 21145 }, { "epoch": 2.6899885510749266, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.6255989074707, "learning_rate": 1e-06, "loss": 0.5909, "mean_token_accuracy": 0.8709936738014221, "num_tokens": 806915583.0, "step": 21146 }, { "epoch": 2.6901157613535176, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.70346450805664, "learning_rate": 1e-06, "loss": 0.6037, "mean_token_accuracy": 0.8669198751449585, "num_tokens": 806953851.0, "step": 21147 }, { "epoch": 2.6902429716321077, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.44807052612305, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.858241856098175, "num_tokens": 806988519.0, "step": 21148 }, { "epoch": 2.6903701819106987, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.35634994506836, "learning_rate": 1e-06, "loss": 0.6052, "mean_token_accuracy": 0.865917444229126, "num_tokens": 807029612.0, "step": 21149 }, { "epoch": 2.6904973921892887, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.49943161010742, "learning_rate": 1e-06, "loss": 0.6601, "mean_token_accuracy": 0.8468737602233887, "num_tokens": 807064009.0, "step": 21150 }, { "epoch": 2.6906246024678793, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.35182571411133, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8608342409133911, "num_tokens": 807104694.0, "step": 21151 }, { "epoch": 2.69075181274647, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.67224884033203, "learning_rate": 1e-06, "loss": 0.6956, "mean_token_accuracy": 0.8405700325965881, "num_tokens": 807142500.0, "step": 21152 }, { "epoch": 2.6908790230250603, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 50.96223449707031, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.86104416847229, "num_tokens": 807172123.0, "step": 21153 }, { "epoch": 2.691006233303651, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.61017608642578, "learning_rate": 1e-06, "loss": 0.6236, "mean_token_accuracy": 0.8659712672233582, "num_tokens": 807206422.0, "step": 21154 }, { "epoch": 2.6911334435822414, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.30023193359375, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8495968580245972, "num_tokens": 807240867.0, "step": 21155 }, { "epoch": 2.691260653860832, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.742610931396484, "learning_rate": 1e-06, "loss": 0.5799, "mean_token_accuracy": 0.8740316033363342, "num_tokens": 807275838.0, "step": 21156 }, { "epoch": 2.6913878641394224, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.73265838623047, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.87060546875, "num_tokens": 807315164.0, "step": 21157 }, { "epoch": 2.691515074418013, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.65693664550781, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8547829389572144, "num_tokens": 807358016.0, "step": 21158 }, { "epoch": 2.6916422846966035, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.875370025634766, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8630095720291138, "num_tokens": 807402751.0, "step": 21159 }, { "epoch": 2.691769494975194, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.66505432128906, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8666388392448425, "num_tokens": 807442761.0, "step": 21160 }, { "epoch": 2.6918967052537845, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.56735610961914, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8694319725036621, "num_tokens": 807478550.0, "step": 21161 }, { "epoch": 2.692023915532375, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.45984649658203, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8731194734573364, "num_tokens": 807522756.0, "step": 21162 }, { "epoch": 2.6921511258109656, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.50053787231445, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8680371046066284, "num_tokens": 807561701.0, "step": 21163 }, { "epoch": 2.692278336089556, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.350730895996094, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8620356321334839, "num_tokens": 807604058.0, "step": 21164 }, { "epoch": 2.6924055463681467, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.977298736572266, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8692789077758789, "num_tokens": 807637344.0, "step": 21165 }, { "epoch": 2.692532756646737, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.268272399902344, "learning_rate": 1e-06, "loss": 0.6434, "mean_token_accuracy": 0.8562825322151184, "num_tokens": 807676867.0, "step": 21166 }, { "epoch": 2.6926599669253277, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.17329788208008, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8667158484458923, "num_tokens": 807707183.0, "step": 21167 }, { "epoch": 2.6927871772039182, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.776466369628906, "learning_rate": 1e-06, "loss": 0.6414, "mean_token_accuracy": 0.8539142608642578, "num_tokens": 807737617.0, "step": 21168 }, { "epoch": 2.6929143874825083, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.19410705566406, "learning_rate": 1e-06, "loss": 0.6336, "mean_token_accuracy": 0.8548005819320679, "num_tokens": 807774141.0, "step": 21169 }, { "epoch": 2.6930415977610993, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.25127410888672, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8688044548034668, "num_tokens": 807813375.0, "step": 21170 }, { "epoch": 2.6931688080396894, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.28837966918945, "learning_rate": 1e-06, "loss": 0.6535, "mean_token_accuracy": 0.8486372232437134, "num_tokens": 807852317.0, "step": 21171 }, { "epoch": 2.6932960183182804, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.892887115478516, "learning_rate": 1e-06, "loss": 0.6194, "mean_token_accuracy": 0.8639345765113831, "num_tokens": 807895266.0, "step": 21172 }, { "epoch": 2.6934232285968704, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.417198181152344, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8647677898406982, "num_tokens": 807938600.0, "step": 21173 }, { "epoch": 2.693550438875461, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.80360794067383, "learning_rate": 1e-06, "loss": 0.551, "mean_token_accuracy": 0.8819133043289185, "num_tokens": 807974329.0, "step": 21174 }, { "epoch": 2.6936776491540515, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.39603042602539, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8748466968536377, "num_tokens": 808015666.0, "step": 21175 }, { "epoch": 2.693804859432642, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.66700744628906, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8750070929527283, "num_tokens": 808050176.0, "step": 21176 }, { "epoch": 2.6939320697112326, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.891639709472656, "learning_rate": 1e-06, "loss": 0.6254, "mean_token_accuracy": 0.8562597632408142, "num_tokens": 808091910.0, "step": 21177 }, { "epoch": 2.694059279989823, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.324623107910156, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8599058985710144, "num_tokens": 808128646.0, "step": 21178 }, { "epoch": 2.6941864902684136, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.22336959838867, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8670527935028076, "num_tokens": 808166544.0, "step": 21179 }, { "epoch": 2.694313700547004, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.49479675292969, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8540620803833008, "num_tokens": 808200051.0, "step": 21180 }, { "epoch": 2.6944409108255947, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.67502212524414, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8655174374580383, "num_tokens": 808239251.0, "step": 21181 }, { "epoch": 2.694568121104185, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.634002685546875, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8577739000320435, "num_tokens": 808280639.0, "step": 21182 }, { "epoch": 2.6946953313827757, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.70816421508789, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8711678981781006, "num_tokens": 808316141.0, "step": 21183 }, { "epoch": 2.6948225416613663, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.06691360473633, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8610509634017944, "num_tokens": 808359172.0, "step": 21184 }, { "epoch": 2.694949751939957, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.361236572265625, "learning_rate": 1e-06, "loss": 0.7124, "mean_token_accuracy": 0.8340458869934082, "num_tokens": 808397823.0, "step": 21185 }, { "epoch": 2.6950769622185473, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.374908447265625, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8729945421218872, "num_tokens": 808435355.0, "step": 21186 }, { "epoch": 2.695204172497138, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.642845153808594, "learning_rate": 1e-06, "loss": 0.5853, "mean_token_accuracy": 0.8700352907180786, "num_tokens": 808470792.0, "step": 21187 }, { "epoch": 2.6953313827757284, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.767822265625, "learning_rate": 1e-06, "loss": 0.5594, "mean_token_accuracy": 0.8814742565155029, "num_tokens": 808504591.0, "step": 21188 }, { "epoch": 2.695458593054319, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.983314514160156, "learning_rate": 1e-06, "loss": 0.5653, "mean_token_accuracy": 0.8752932548522949, "num_tokens": 808537167.0, "step": 21189 }, { "epoch": 2.6955858033329094, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.83733367919922, "learning_rate": 1e-06, "loss": 0.5659, "mean_token_accuracy": 0.8786484003067017, "num_tokens": 808572055.0, "step": 21190 }, { "epoch": 2.6957130136115, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.515830993652344, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8764955997467041, "num_tokens": 808605239.0, "step": 21191 }, { "epoch": 2.6958402238900905, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.38330078125, "learning_rate": 1e-06, "loss": 0.68, "mean_token_accuracy": 0.8402565121650696, "num_tokens": 808643227.0, "step": 21192 }, { "epoch": 2.695967434168681, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.321102142333984, "learning_rate": 1e-06, "loss": 0.6381, "mean_token_accuracy": 0.8561566472053528, "num_tokens": 808683855.0, "step": 21193 }, { "epoch": 2.696094644447271, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.546417236328125, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8690483570098877, "num_tokens": 808723467.0, "step": 21194 }, { "epoch": 2.696221854725862, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.664039611816406, "learning_rate": 1e-06, "loss": 0.6574, "mean_token_accuracy": 0.8513774275779724, "num_tokens": 808759805.0, "step": 21195 }, { "epoch": 2.696349065004452, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.21767044067383, "learning_rate": 1e-06, "loss": 0.6675, "mean_token_accuracy": 0.8469496965408325, "num_tokens": 808802024.0, "step": 21196 }, { "epoch": 2.696476275283043, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.735084533691406, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8541209697723389, "num_tokens": 808837086.0, "step": 21197 }, { "epoch": 2.696603485561633, "ewc_loss": 0.2060546875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018215179443359375, "grad_norm": 51.53200149536133, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8630452156066895, "num_tokens": 808875261.0, "step": 21198 }, { "epoch": 2.6967306958402237, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.00835418701172, "learning_rate": 1e-06, "loss": 0.5868, "mean_token_accuracy": 0.8701258301734924, "num_tokens": 808913044.0, "step": 21199 }, { "epoch": 2.6968579061188143, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.881446838378906, "learning_rate": 1e-06, "loss": 0.5833, "mean_token_accuracy": 0.8731701970100403, "num_tokens": 808952374.0, "step": 21200 }, { "epoch": 2.696985116397405, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.815921783447266, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8648147583007812, "num_tokens": 808994397.0, "step": 21201 }, { "epoch": 2.6971123266759953, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.932437896728516, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8673895597457886, "num_tokens": 809036401.0, "step": 21202 }, { "epoch": 2.697239536954586, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.4926872253418, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8689960241317749, "num_tokens": 809073073.0, "step": 21203 }, { "epoch": 2.6973667472331764, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.01115036010742, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8638820648193359, "num_tokens": 809117348.0, "step": 21204 }, { "epoch": 2.697493957511767, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.63819885253906, "learning_rate": 1e-06, "loss": 0.5766, "mean_token_accuracy": 0.8763361573219299, "num_tokens": 809151357.0, "step": 21205 }, { "epoch": 2.6976211677903574, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.64401626586914, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8573517799377441, "num_tokens": 809189000.0, "step": 21206 }, { "epoch": 2.697748378068948, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.117916107177734, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8662974834442139, "num_tokens": 809227380.0, "step": 21207 }, { "epoch": 2.6978755883475385, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.67775344848633, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8640490770339966, "num_tokens": 809265131.0, "step": 21208 }, { "epoch": 2.698002798626129, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.41911697387695, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8629530668258667, "num_tokens": 809297190.0, "step": 21209 }, { "epoch": 2.6981300089047195, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.0004768371582, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8729638457298279, "num_tokens": 809336774.0, "step": 21210 }, { "epoch": 2.69825721918331, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.438175201416016, "learning_rate": 1e-06, "loss": 0.6629, "mean_token_accuracy": 0.8464006185531616, "num_tokens": 809376612.0, "step": 21211 }, { "epoch": 2.6983844294619006, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.06278991699219, "learning_rate": 1e-06, "loss": 0.5347, "mean_token_accuracy": 0.8853212594985962, "num_tokens": 809416058.0, "step": 21212 }, { "epoch": 2.698511639740491, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 50.953453063964844, "learning_rate": 1e-06, "loss": 0.5707, "mean_token_accuracy": 0.8752884864807129, "num_tokens": 809449580.0, "step": 21213 }, { "epoch": 2.6986388500190817, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.53468322753906, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8681074380874634, "num_tokens": 809484983.0, "step": 21214 }, { "epoch": 2.698766060297672, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 50.937782287597656, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.8676503896713257, "num_tokens": 809521086.0, "step": 21215 }, { "epoch": 2.6988932705762627, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.10072708129883, "learning_rate": 1e-06, "loss": 0.6337, "mean_token_accuracy": 0.8615878820419312, "num_tokens": 809562475.0, "step": 21216 }, { "epoch": 2.699020480854853, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.17094802856445, "learning_rate": 1e-06, "loss": 0.6566, "mean_token_accuracy": 0.8507516384124756, "num_tokens": 809596686.0, "step": 21217 }, { "epoch": 2.6991476911334438, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.22760009765625, "learning_rate": 1e-06, "loss": 0.6239, "mean_token_accuracy": 0.866239070892334, "num_tokens": 809636614.0, "step": 21218 }, { "epoch": 2.699274901412034, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.447120666503906, "learning_rate": 1e-06, "loss": 0.6514, "mean_token_accuracy": 0.8518813848495483, "num_tokens": 809674542.0, "step": 21219 }, { "epoch": 2.699402111690625, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.13199234008789, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.879595935344696, "num_tokens": 809716726.0, "step": 21220 }, { "epoch": 2.699529321969215, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.607460021972656, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8729887008666992, "num_tokens": 809756790.0, "step": 21221 }, { "epoch": 2.699656532247806, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.90430450439453, "learning_rate": 1e-06, "loss": 0.6228, "mean_token_accuracy": 0.8615138530731201, "num_tokens": 809794616.0, "step": 21222 }, { "epoch": 2.699783742526396, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.67597198486328, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8701704740524292, "num_tokens": 809836662.0, "step": 21223 }, { "epoch": 2.6999109528049865, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.645015716552734, "learning_rate": 1e-06, "loss": 0.6285, "mean_token_accuracy": 0.8609548807144165, "num_tokens": 809873812.0, "step": 21224 }, { "epoch": 2.700038163083577, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.34484100341797, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.864186704158783, "num_tokens": 809913139.0, "step": 21225 }, { "epoch": 2.7001653733621676, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.194091796875, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8655922412872314, "num_tokens": 809947741.0, "step": 21226 }, { "epoch": 2.700292583640758, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.736419677734375, "learning_rate": 1e-06, "loss": 0.6435, "mean_token_accuracy": 0.8551242351531982, "num_tokens": 809987120.0, "step": 21227 }, { "epoch": 2.7004197939193486, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 50.977745056152344, "learning_rate": 1e-06, "loss": 0.6389, "mean_token_accuracy": 0.8574033975601196, "num_tokens": 810026972.0, "step": 21228 }, { "epoch": 2.700547004197939, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.84733200073242, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8639834523200989, "num_tokens": 810062767.0, "step": 21229 }, { "epoch": 2.7006742144765297, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.1203498840332, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8724409937858582, "num_tokens": 810096930.0, "step": 21230 }, { "epoch": 2.70080142475512, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.06639862060547, "learning_rate": 1e-06, "loss": 0.6673, "mean_token_accuracy": 0.8500103950500488, "num_tokens": 810134600.0, "step": 21231 }, { "epoch": 2.7009286350337107, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.203670501708984, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8657302260398865, "num_tokens": 810169535.0, "step": 21232 }, { "epoch": 2.7010558453123013, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.998775482177734, "learning_rate": 1e-06, "loss": 0.656, "mean_token_accuracy": 0.8524818420410156, "num_tokens": 810205288.0, "step": 21233 }, { "epoch": 2.701183055590892, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.449832916259766, "learning_rate": 1e-06, "loss": 0.6319, "mean_token_accuracy": 0.8613353967666626, "num_tokens": 810242868.0, "step": 21234 }, { "epoch": 2.7013102658694823, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.08296203613281, "learning_rate": 1e-06, "loss": 0.6577, "mean_token_accuracy": 0.8511144518852234, "num_tokens": 810286539.0, "step": 21235 }, { "epoch": 2.701437476148073, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.20048522949219, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8667209148406982, "num_tokens": 810331598.0, "step": 21236 }, { "epoch": 2.7015646864266634, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.873477935791016, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8713832497596741, "num_tokens": 810369501.0, "step": 21237 }, { "epoch": 2.701691896705254, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.558189392089844, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8604717254638672, "num_tokens": 810410727.0, "step": 21238 }, { "epoch": 2.7018191069838444, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.30400466918945, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8554083108901978, "num_tokens": 810450315.0, "step": 21239 }, { "epoch": 2.701946317262435, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.562355041503906, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.8537764549255371, "num_tokens": 810493972.0, "step": 21240 }, { "epoch": 2.7020735275410255, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.75067901611328, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8760708570480347, "num_tokens": 810526038.0, "step": 21241 }, { "epoch": 2.7022007378196156, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.95292282104492, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8680944442749023, "num_tokens": 810560772.0, "step": 21242 }, { "epoch": 2.7023279480982065, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.32246780395508, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8710368871688843, "num_tokens": 810598322.0, "step": 21243 }, { "epoch": 2.7024551583767966, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.20933151245117, "learning_rate": 1e-06, "loss": 0.5621, "mean_token_accuracy": 0.8840584754943848, "num_tokens": 810638652.0, "step": 21244 }, { "epoch": 2.7025823686553876, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.56822204589844, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.8613619208335876, "num_tokens": 810680318.0, "step": 21245 }, { "epoch": 2.7027095789339777, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.730594635009766, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8676810264587402, "num_tokens": 810715001.0, "step": 21246 }, { "epoch": 2.7028367892125686, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.70198059082031, "learning_rate": 1e-06, "loss": 0.6579, "mean_token_accuracy": 0.8540509939193726, "num_tokens": 810748897.0, "step": 21247 }, { "epoch": 2.7029639994911587, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.89862823486328, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.857980489730835, "num_tokens": 810788954.0, "step": 21248 }, { "epoch": 2.7030912097697493, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.13178634643555, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8671616315841675, "num_tokens": 810828897.0, "step": 21249 }, { "epoch": 2.70321842004834, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.772254943847656, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8809902667999268, "num_tokens": 810866698.0, "step": 21250 }, { "epoch": 2.7033456303269303, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.4505500793457, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8555927872657776, "num_tokens": 810903140.0, "step": 21251 }, { "epoch": 2.703472840605521, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.49300003051758, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8673787713050842, "num_tokens": 810940664.0, "step": 21252 }, { "epoch": 2.7036000508841114, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.43771743774414, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.861264705657959, "num_tokens": 810981818.0, "step": 21253 }, { "epoch": 2.703727261162702, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.88374710083008, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8621912598609924, "num_tokens": 811019897.0, "step": 21254 }, { "epoch": 2.7038544714412924, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.217140197753906, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8662368655204773, "num_tokens": 811056310.0, "step": 21255 }, { "epoch": 2.703981681719883, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.86348342895508, "learning_rate": 1e-06, "loss": 0.6479, "mean_token_accuracy": 0.8549598455429077, "num_tokens": 811091505.0, "step": 21256 }, { "epoch": 2.7041088919984735, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 50.96826934814453, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.876821756362915, "num_tokens": 811134861.0, "step": 21257 }, { "epoch": 2.704236102277064, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.26008987426758, "learning_rate": 1e-06, "loss": 0.6479, "mean_token_accuracy": 0.8542739152908325, "num_tokens": 811170001.0, "step": 21258 }, { "epoch": 2.7043633125556545, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 50.97822952270508, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8699830770492554, "num_tokens": 811204203.0, "step": 21259 }, { "epoch": 2.704490522834245, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.1057243347168, "learning_rate": 1e-06, "loss": 0.6251, "mean_token_accuracy": 0.8661065101623535, "num_tokens": 811244598.0, "step": 21260 }, { "epoch": 2.7046177331128356, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.13019943237305, "learning_rate": 1e-06, "loss": 0.5817, "mean_token_accuracy": 0.8745503425598145, "num_tokens": 811283352.0, "step": 21261 }, { "epoch": 2.704744943391426, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 51.613285064697266, "learning_rate": 1e-06, "loss": 0.6607, "mean_token_accuracy": 0.8512370586395264, "num_tokens": 811317605.0, "step": 21262 }, { "epoch": 2.7048721536700167, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.71798324584961, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8787568807601929, "num_tokens": 811352117.0, "step": 21263 }, { "epoch": 2.704999363948607, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.92998123168945, "learning_rate": 1e-06, "loss": 0.6347, "mean_token_accuracy": 0.8576464653015137, "num_tokens": 811386118.0, "step": 21264 }, { "epoch": 2.7051265742271977, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.16316223144531, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8723921775817871, "num_tokens": 811424479.0, "step": 21265 }, { "epoch": 2.7052537845057882, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.05741882324219, "learning_rate": 1e-06, "loss": 0.6578, "mean_token_accuracy": 0.8516671657562256, "num_tokens": 811464683.0, "step": 21266 }, { "epoch": 2.7053809947843783, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.22005844116211, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8795626759529114, "num_tokens": 811504801.0, "step": 21267 }, { "epoch": 2.7055082050629693, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.88901138305664, "learning_rate": 1e-06, "loss": 0.6473, "mean_token_accuracy": 0.8549044132232666, "num_tokens": 811546278.0, "step": 21268 }, { "epoch": 2.7056354153415594, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.335575103759766, "learning_rate": 1e-06, "loss": 0.6494, "mean_token_accuracy": 0.8485902547836304, "num_tokens": 811587019.0, "step": 21269 }, { "epoch": 2.7057626256201504, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.242958068847656, "learning_rate": 1e-06, "loss": 0.6889, "mean_token_accuracy": 0.8396444320678711, "num_tokens": 811619843.0, "step": 21270 }, { "epoch": 2.7058898358987404, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.383506774902344, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.869455099105835, "num_tokens": 811655797.0, "step": 21271 }, { "epoch": 2.706017046177331, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.1911506652832, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8533551096916199, "num_tokens": 811691626.0, "step": 21272 }, { "epoch": 2.7061442564559215, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.61994934082031, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8651761412620544, "num_tokens": 811726046.0, "step": 21273 }, { "epoch": 2.706271466734512, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.93199920654297, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8544859290122986, "num_tokens": 811765686.0, "step": 21274 }, { "epoch": 2.7063986770131025, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.39591979980469, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8716572523117065, "num_tokens": 811801143.0, "step": 21275 }, { "epoch": 2.706525887291693, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.386627197265625, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8704675436019897, "num_tokens": 811834807.0, "step": 21276 }, { "epoch": 2.7066530975702836, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.40642547607422, "learning_rate": 1e-06, "loss": 0.5545, "mean_token_accuracy": 0.8829929828643799, "num_tokens": 811865676.0, "step": 21277 }, { "epoch": 2.706780307848874, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.476112365722656, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8593589067459106, "num_tokens": 811901259.0, "step": 21278 }, { "epoch": 2.7069075181274647, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.20203399658203, "learning_rate": 1e-06, "loss": 0.6648, "mean_token_accuracy": 0.8494859933853149, "num_tokens": 811937768.0, "step": 21279 }, { "epoch": 2.707034728406055, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.470054626464844, "learning_rate": 1e-06, "loss": 0.6183, "mean_token_accuracy": 0.8639987707138062, "num_tokens": 811967805.0, "step": 21280 }, { "epoch": 2.7071619386846457, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.31296920776367, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.858771800994873, "num_tokens": 812007468.0, "step": 21281 }, { "epoch": 2.7072891489632362, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.79695129394531, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8657628893852234, "num_tokens": 812048127.0, "step": 21282 }, { "epoch": 2.7074163592418268, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.47709655761719, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8638604879379272, "num_tokens": 812083904.0, "step": 21283 }, { "epoch": 2.7075435695204173, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.634674072265625, "learning_rate": 1e-06, "loss": 0.5947, "mean_token_accuracy": 0.8670393824577332, "num_tokens": 812119643.0, "step": 21284 }, { "epoch": 2.707670779799008, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.89486312866211, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8613111972808838, "num_tokens": 812155338.0, "step": 21285 }, { "epoch": 2.7077979900775984, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.46849060058594, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8630625605583191, "num_tokens": 812193075.0, "step": 21286 }, { "epoch": 2.707925200356189, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.22530746459961, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8724579215049744, "num_tokens": 812228163.0, "step": 21287 }, { "epoch": 2.7080524106347794, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.6035270690918, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8573793172836304, "num_tokens": 812268565.0, "step": 21288 }, { "epoch": 2.70817962091337, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.27125930786133, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8692740797996521, "num_tokens": 812307292.0, "step": 21289 }, { "epoch": 2.7083068311919605, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.81733322143555, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8739074468612671, "num_tokens": 812346180.0, "step": 21290 }, { "epoch": 2.708434041470551, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.13815689086914, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8606973886489868, "num_tokens": 812376033.0, "step": 21291 }, { "epoch": 2.708561251749141, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.326744079589844, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8818718791007996, "num_tokens": 812407925.0, "step": 21292 }, { "epoch": 2.708688462027732, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.889190673828125, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8691372275352478, "num_tokens": 812452354.0, "step": 21293 }, { "epoch": 2.708815672306322, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.984886169433594, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8615972995758057, "num_tokens": 812490015.0, "step": 21294 }, { "epoch": 2.708942882584913, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.61174392700195, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.860492467880249, "num_tokens": 812529919.0, "step": 21295 }, { "epoch": 2.709070092863503, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.11356735229492, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8769907355308533, "num_tokens": 812568541.0, "step": 21296 }, { "epoch": 2.7091973031420937, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.96839141845703, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.8553765416145325, "num_tokens": 812612098.0, "step": 21297 }, { "epoch": 2.7093245134206843, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.726097106933594, "learning_rate": 1e-06, "loss": 0.6262, "mean_token_accuracy": 0.8593275547027588, "num_tokens": 812648843.0, "step": 21298 }, { "epoch": 2.709451723699275, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.95670700073242, "learning_rate": 1e-06, "loss": 0.6184, "mean_token_accuracy": 0.8646633625030518, "num_tokens": 812692080.0, "step": 21299 }, { "epoch": 2.7095789339778653, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.212581634521484, "learning_rate": 1e-06, "loss": 0.5757, "mean_token_accuracy": 0.877424955368042, "num_tokens": 812729441.0, "step": 21300 }, { "epoch": 2.709706144256456, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.826904296875, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.8576375246047974, "num_tokens": 812768908.0, "step": 21301 }, { "epoch": 2.7098333545350464, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.409034729003906, "learning_rate": 1e-06, "loss": 0.6082, "mean_token_accuracy": 0.8619365096092224, "num_tokens": 812808384.0, "step": 21302 }, { "epoch": 2.709960564813637, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.23426818847656, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8562543988227844, "num_tokens": 812847658.0, "step": 21303 }, { "epoch": 2.7100877750922274, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.88204574584961, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8632845878601074, "num_tokens": 812889394.0, "step": 21304 }, { "epoch": 2.710214985370818, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.628944396972656, "learning_rate": 1e-06, "loss": 0.6516, "mean_token_accuracy": 0.8490917682647705, "num_tokens": 812924202.0, "step": 21305 }, { "epoch": 2.7103421956494085, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.823570251464844, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.8830362558364868, "num_tokens": 812954391.0, "step": 21306 }, { "epoch": 2.710469405927999, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.5569953918457, "learning_rate": 1e-06, "loss": 0.6353, "mean_token_accuracy": 0.8544992208480835, "num_tokens": 812989498.0, "step": 21307 }, { "epoch": 2.7105966162065895, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.848602294921875, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8699569702148438, "num_tokens": 813030968.0, "step": 21308 }, { "epoch": 2.71072382648518, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.24415588378906, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8866479396820068, "num_tokens": 813065843.0, "step": 21309 }, { "epoch": 2.7108510367637706, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.60854721069336, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8696692585945129, "num_tokens": 813107410.0, "step": 21310 }, { "epoch": 2.710978247042361, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.075714111328125, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8710487484931946, "num_tokens": 813155959.0, "step": 21311 }, { "epoch": 2.7111054573209517, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.706871032714844, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8712645769119263, "num_tokens": 813194133.0, "step": 21312 }, { "epoch": 2.711232667599542, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.22050476074219, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8513412475585938, "num_tokens": 813231015.0, "step": 21313 }, { "epoch": 2.7113598778781327, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.80434799194336, "learning_rate": 1e-06, "loss": 0.5508, "mean_token_accuracy": 0.8793724775314331, "num_tokens": 813271497.0, "step": 21314 }, { "epoch": 2.711487088156723, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.51881790161133, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8645623922348022, "num_tokens": 813305758.0, "step": 21315 }, { "epoch": 2.7116142984353138, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 52.57377243041992, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8628734350204468, "num_tokens": 813345958.0, "step": 21316 }, { "epoch": 2.711741508713904, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.81943893432617, "learning_rate": 1e-06, "loss": 0.5849, "mean_token_accuracy": 0.8670421242713928, "num_tokens": 813381630.0, "step": 21317 }, { "epoch": 2.711868718992495, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.708038330078125, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8634911775588989, "num_tokens": 813421320.0, "step": 21318 }, { "epoch": 2.711995929271085, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.5626106262207, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8595721125602722, "num_tokens": 813458323.0, "step": 21319 }, { "epoch": 2.712123139549676, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.74386215209961, "learning_rate": 1e-06, "loss": 0.677, "mean_token_accuracy": 0.8460478782653809, "num_tokens": 813501436.0, "step": 21320 }, { "epoch": 2.712250349828266, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.774803161621094, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8651566505432129, "num_tokens": 813545942.0, "step": 21321 }, { "epoch": 2.7123775601068565, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.366966247558594, "learning_rate": 1e-06, "loss": 0.6279, "mean_token_accuracy": 0.8580945730209351, "num_tokens": 813582202.0, "step": 21322 }, { "epoch": 2.712504770385447, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.968116760253906, "learning_rate": 1e-06, "loss": 0.6562, "mean_token_accuracy": 0.8497002124786377, "num_tokens": 813621711.0, "step": 21323 }, { "epoch": 2.7126319806640375, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.36262512207031, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.858295202255249, "num_tokens": 813665404.0, "step": 21324 }, { "epoch": 2.712759190942628, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.05990982055664, "learning_rate": 1e-06, "loss": 0.5813, "mean_token_accuracy": 0.8697330951690674, "num_tokens": 813697470.0, "step": 21325 }, { "epoch": 2.7128864012212186, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.54562759399414, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8722138404846191, "num_tokens": 813733308.0, "step": 21326 }, { "epoch": 2.713013611499809, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.06700897216797, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.850736141204834, "num_tokens": 813770881.0, "step": 21327 }, { "epoch": 2.7131408217783997, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.55598831176758, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8685364723205566, "num_tokens": 813806495.0, "step": 21328 }, { "epoch": 2.71326803205699, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.81589889526367, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8687617778778076, "num_tokens": 813842408.0, "step": 21329 }, { "epoch": 2.7133952423355807, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.25360107421875, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8611903190612793, "num_tokens": 813883041.0, "step": 21330 }, { "epoch": 2.7135224526141712, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.38276672363281, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8592439889907837, "num_tokens": 813922901.0, "step": 21331 }, { "epoch": 2.7136496628927618, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.55195617675781, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8656893968582153, "num_tokens": 813961948.0, "step": 21332 }, { "epoch": 2.7137768731713523, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.10250473022461, "learning_rate": 1e-06, "loss": 0.562, "mean_token_accuracy": 0.877863347530365, "num_tokens": 813999194.0, "step": 21333 }, { "epoch": 2.713904083449943, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.27145767211914, "learning_rate": 1e-06, "loss": 0.6635, "mean_token_accuracy": 0.8534607887268066, "num_tokens": 814038986.0, "step": 21334 }, { "epoch": 2.7140312937285334, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018310546875, "grad_norm": 50.668460845947266, "learning_rate": 1e-06, "loss": 0.5622, "mean_token_accuracy": 0.8797390460968018, "num_tokens": 814074351.0, "step": 21335 }, { "epoch": 2.714158504007124, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.2913932800293, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8639860153198242, "num_tokens": 814109255.0, "step": 21336 }, { "epoch": 2.7142857142857144, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 50.72282791137695, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.859076201915741, "num_tokens": 814146601.0, "step": 21337 }, { "epoch": 2.714412924564305, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.209373474121094, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8701305389404297, "num_tokens": 814181299.0, "step": 21338 }, { "epoch": 2.7145401348428955, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.20630645751953, "learning_rate": 1e-06, "loss": 0.5774, "mean_token_accuracy": 0.8739575147628784, "num_tokens": 814218178.0, "step": 21339 }, { "epoch": 2.7146673451214856, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.6594352722168, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.866279661655426, "num_tokens": 814254161.0, "step": 21340 }, { "epoch": 2.7147945554000765, "ewc_loss": 0.20703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018405914306640625, "grad_norm": 51.539390563964844, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8647079467773438, "num_tokens": 814290164.0, "step": 21341 }, { "epoch": 2.7149217656786666, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.18871307373047, "learning_rate": 1e-06, "loss": 0.6906, "mean_token_accuracy": 0.8431856036186218, "num_tokens": 814334459.0, "step": 21342 }, { "epoch": 2.7150489759572576, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.60056686401367, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.855904757976532, "num_tokens": 814374291.0, "step": 21343 }, { "epoch": 2.7151761862358477, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.75893783569336, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8597806692123413, "num_tokens": 814413196.0, "step": 21344 }, { "epoch": 2.7153033965144386, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.72357177734375, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8604177236557007, "num_tokens": 814455312.0, "step": 21345 }, { "epoch": 2.7154306067930287, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.693763732910156, "learning_rate": 1e-06, "loss": 0.633, "mean_token_accuracy": 0.8597211837768555, "num_tokens": 814496586.0, "step": 21346 }, { "epoch": 2.7155578170716193, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.049644470214844, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8690296411514282, "num_tokens": 814531566.0, "step": 21347 }, { "epoch": 2.71568502735021, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.98004913330078, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8707667589187622, "num_tokens": 814566168.0, "step": 21348 }, { "epoch": 2.7158122376288003, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.94285583496094, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8642234802246094, "num_tokens": 814604927.0, "step": 21349 }, { "epoch": 2.715939447907391, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.26799392700195, "learning_rate": 1e-06, "loss": 0.5658, "mean_token_accuracy": 0.877927303314209, "num_tokens": 814641401.0, "step": 21350 }, { "epoch": 2.7160666581859814, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.36384963989258, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8688211441040039, "num_tokens": 814674252.0, "step": 21351 }, { "epoch": 2.716193868464572, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.34319305419922, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8631793260574341, "num_tokens": 814704965.0, "step": 21352 }, { "epoch": 2.7163210787431624, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.831390380859375, "learning_rate": 1e-06, "loss": 0.7006, "mean_token_accuracy": 0.8344206809997559, "num_tokens": 814743740.0, "step": 21353 }, { "epoch": 2.716448289021753, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.9083137512207, "learning_rate": 1e-06, "loss": 0.5985, "mean_token_accuracy": 0.8707499504089355, "num_tokens": 814784598.0, "step": 21354 }, { "epoch": 2.7165754993003435, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.741111755371094, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.858246386051178, "num_tokens": 814821437.0, "step": 21355 }, { "epoch": 2.716702709578934, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.2089958190918, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8617170453071594, "num_tokens": 814861638.0, "step": 21356 }, { "epoch": 2.7168299198575245, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.843868255615234, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8669484853744507, "num_tokens": 814893799.0, "step": 21357 }, { "epoch": 2.716957130136115, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.53284454345703, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8721633553504944, "num_tokens": 814930726.0, "step": 21358 }, { "epoch": 2.7170843404147056, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.81468963623047, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.860995888710022, "num_tokens": 814970041.0, "step": 21359 }, { "epoch": 2.717211550693296, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.20862579345703, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8667122721672058, "num_tokens": 815008587.0, "step": 21360 }, { "epoch": 2.7173387609718866, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.225868225097656, "learning_rate": 1e-06, "loss": 0.6505, "mean_token_accuracy": 0.849831223487854, "num_tokens": 815047659.0, "step": 21361 }, { "epoch": 2.717465971250477, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.83025360107422, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8642708659172058, "num_tokens": 815082631.0, "step": 21362 }, { "epoch": 2.7175931815290677, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.161033630371094, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8811783194541931, "num_tokens": 815117568.0, "step": 21363 }, { "epoch": 2.7177203918076582, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.857688903808594, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8647681474685669, "num_tokens": 815155019.0, "step": 21364 }, { "epoch": 2.7178476020862483, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.62015914916992, "learning_rate": 1e-06, "loss": 0.6077, "mean_token_accuracy": 0.8643122911453247, "num_tokens": 815195386.0, "step": 21365 }, { "epoch": 2.7179748123648393, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.059112548828125, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8690222501754761, "num_tokens": 815228304.0, "step": 21366 }, { "epoch": 2.7181020226434294, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.09821319580078, "learning_rate": 1e-06, "loss": 0.5434, "mean_token_accuracy": 0.8834455609321594, "num_tokens": 815260111.0, "step": 21367 }, { "epoch": 2.7182292329220203, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.32513427734375, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8629902601242065, "num_tokens": 815297272.0, "step": 21368 }, { "epoch": 2.7183564432006104, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.79791259765625, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8642559051513672, "num_tokens": 815336016.0, "step": 21369 }, { "epoch": 2.718483653479201, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.290130615234375, "learning_rate": 1e-06, "loss": 0.6907, "mean_token_accuracy": 0.8365648984909058, "num_tokens": 815373940.0, "step": 21370 }, { "epoch": 2.7186108637577915, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.58256149291992, "learning_rate": 1e-06, "loss": 0.6493, "mean_token_accuracy": 0.8526367545127869, "num_tokens": 815410836.0, "step": 21371 }, { "epoch": 2.718738074036382, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.34912872314453, "learning_rate": 1e-06, "loss": 0.5886, "mean_token_accuracy": 0.8703484535217285, "num_tokens": 815449577.0, "step": 21372 }, { "epoch": 2.7188652843149725, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.25263977050781, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8658768534660339, "num_tokens": 815485869.0, "step": 21373 }, { "epoch": 2.718992494593563, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.01088333129883, "learning_rate": 1e-06, "loss": 0.6691, "mean_token_accuracy": 0.8447695970535278, "num_tokens": 815523614.0, "step": 21374 }, { "epoch": 2.7191197048721536, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.210575103759766, "learning_rate": 1e-06, "loss": 0.6741, "mean_token_accuracy": 0.8431572914123535, "num_tokens": 815561835.0, "step": 21375 }, { "epoch": 2.719246915150744, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.69306564331055, "learning_rate": 1e-06, "loss": 0.6576, "mean_token_accuracy": 0.8472051620483398, "num_tokens": 815605596.0, "step": 21376 }, { "epoch": 2.7193741254293347, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.56907653808594, "learning_rate": 1e-06, "loss": 0.6308, "mean_token_accuracy": 0.8585296869277954, "num_tokens": 815638720.0, "step": 21377 }, { "epoch": 2.719501335707925, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.924591064453125, "learning_rate": 1e-06, "loss": 0.5579, "mean_token_accuracy": 0.8801266551017761, "num_tokens": 815673996.0, "step": 21378 }, { "epoch": 2.7196285459865157, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.931846618652344, "learning_rate": 1e-06, "loss": 0.6548, "mean_token_accuracy": 0.8503884673118591, "num_tokens": 815716571.0, "step": 21379 }, { "epoch": 2.7197557562651062, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.94913864135742, "learning_rate": 1e-06, "loss": 0.5859, "mean_token_accuracy": 0.8713660836219788, "num_tokens": 815748973.0, "step": 21380 }, { "epoch": 2.7198829665436968, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.0764045715332, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8659743070602417, "num_tokens": 815794754.0, "step": 21381 }, { "epoch": 2.7200101768222873, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.07522964477539, "learning_rate": 1e-06, "loss": 0.5983, "mean_token_accuracy": 0.8657622933387756, "num_tokens": 815831051.0, "step": 21382 }, { "epoch": 2.720137387100878, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.90298843383789, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8658010363578796, "num_tokens": 815874291.0, "step": 21383 }, { "epoch": 2.7202645973794684, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.10847854614258, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.859511137008667, "num_tokens": 815912484.0, "step": 21384 }, { "epoch": 2.720391807658059, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.0054817199707, "learning_rate": 1e-06, "loss": 0.6714, "mean_token_accuracy": 0.8469350337982178, "num_tokens": 815948626.0, "step": 21385 }, { "epoch": 2.7205190179366494, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.28019332885742, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8738398551940918, "num_tokens": 815983154.0, "step": 21386 }, { "epoch": 2.72064622821524, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.444969177246094, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8657379150390625, "num_tokens": 816017524.0, "step": 21387 }, { "epoch": 2.7207734384938305, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.19292449951172, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.8586389422416687, "num_tokens": 816055204.0, "step": 21388 }, { "epoch": 2.720900648772421, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.5954704284668, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8716752529144287, "num_tokens": 816086671.0, "step": 21389 }, { "epoch": 2.721027859051011, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.304527282714844, "learning_rate": 1e-06, "loss": 0.6933, "mean_token_accuracy": 0.8425593376159668, "num_tokens": 816114173.0, "step": 21390 }, { "epoch": 2.721155069329602, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.84589385986328, "learning_rate": 1e-06, "loss": 0.5717, "mean_token_accuracy": 0.8759401440620422, "num_tokens": 816156434.0, "step": 21391 }, { "epoch": 2.721282279608192, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.78060531616211, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8578800559043884, "num_tokens": 816191037.0, "step": 21392 }, { "epoch": 2.721409489886783, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.692840576171875, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8795251846313477, "num_tokens": 816227031.0, "step": 21393 }, { "epoch": 2.721536700165373, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.89814758300781, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8669164180755615, "num_tokens": 816264308.0, "step": 21394 }, { "epoch": 2.7216639104439637, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.90433883666992, "learning_rate": 1e-06, "loss": 0.6214, "mean_token_accuracy": 0.8608819246292114, "num_tokens": 816300955.0, "step": 21395 }, { "epoch": 2.7217911207225542, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.539737701416016, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.863700807094574, "num_tokens": 816332881.0, "step": 21396 }, { "epoch": 2.7219183310011448, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.5582160949707, "learning_rate": 1e-06, "loss": 0.5528, "mean_token_accuracy": 0.8829915523529053, "num_tokens": 816367446.0, "step": 21397 }, { "epoch": 2.7220455412797353, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.05345153808594, "learning_rate": 1e-06, "loss": 0.6663, "mean_token_accuracy": 0.8477894067764282, "num_tokens": 816407286.0, "step": 21398 }, { "epoch": 2.722172751558326, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.94374084472656, "learning_rate": 1e-06, "loss": 0.5691, "mean_token_accuracy": 0.8737382888793945, "num_tokens": 816445277.0, "step": 21399 }, { "epoch": 2.7222999618369164, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.801998138427734, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8708232641220093, "num_tokens": 816481638.0, "step": 21400 }, { "epoch": 2.722427172115507, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.24942398071289, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8661535382270813, "num_tokens": 816521386.0, "step": 21401 }, { "epoch": 2.7225543823940974, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.704830169677734, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8762280344963074, "num_tokens": 816554760.0, "step": 21402 }, { "epoch": 2.722681592672688, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.94564437866211, "learning_rate": 1e-06, "loss": 0.5832, "mean_token_accuracy": 0.8726414442062378, "num_tokens": 816595195.0, "step": 21403 }, { "epoch": 2.7228088029512785, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.23538589477539, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8672295808792114, "num_tokens": 816633046.0, "step": 21404 }, { "epoch": 2.722936013229869, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.81006622314453, "learning_rate": 1e-06, "loss": 0.5745, "mean_token_accuracy": 0.879390299320221, "num_tokens": 816675283.0, "step": 21405 }, { "epoch": 2.7230632235084595, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.151668548583984, "learning_rate": 1e-06, "loss": 0.6344, "mean_token_accuracy": 0.8544285297393799, "num_tokens": 816716499.0, "step": 21406 }, { "epoch": 2.72319043378705, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.76387405395508, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.858110785484314, "num_tokens": 816761825.0, "step": 21407 }, { "epoch": 2.7233176440656406, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.39597702026367, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8714950680732727, "num_tokens": 816792080.0, "step": 21408 }, { "epoch": 2.723444854344231, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.5875358581543, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.8549861907958984, "num_tokens": 816832121.0, "step": 21409 }, { "epoch": 2.7235720646228216, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.41562271118164, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8574011921882629, "num_tokens": 816874032.0, "step": 21410 }, { "epoch": 2.723699274901412, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.989036560058594, "learning_rate": 1e-06, "loss": 0.6732, "mean_token_accuracy": 0.8454201221466064, "num_tokens": 816920471.0, "step": 21411 }, { "epoch": 2.7238264851800027, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 50.9409065246582, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8633159399032593, "num_tokens": 816958042.0, "step": 21412 }, { "epoch": 2.723953695458593, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.982845306396484, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8540059328079224, "num_tokens": 816992158.0, "step": 21413 }, { "epoch": 2.7240809057371838, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.523292541503906, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8523589372634888, "num_tokens": 817032552.0, "step": 21414 }, { "epoch": 2.724208116015774, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.609867095947266, "learning_rate": 1e-06, "loss": 0.644, "mean_token_accuracy": 0.8631207346916199, "num_tokens": 817072240.0, "step": 21415 }, { "epoch": 2.724335326294365, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.18937301635742, "learning_rate": 1e-06, "loss": 0.6391, "mean_token_accuracy": 0.8547051548957825, "num_tokens": 817104741.0, "step": 21416 }, { "epoch": 2.724462536572955, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 50.77302551269531, "learning_rate": 1e-06, "loss": 0.6568, "mean_token_accuracy": 0.8508706092834473, "num_tokens": 817143876.0, "step": 21417 }, { "epoch": 2.724589746851546, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.54169464111328, "learning_rate": 1e-06, "loss": 0.5428, "mean_token_accuracy": 0.8876668214797974, "num_tokens": 817183647.0, "step": 21418 }, { "epoch": 2.724716957130136, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 50.94824981689453, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8583294153213501, "num_tokens": 817214520.0, "step": 21419 }, { "epoch": 2.7248441674087265, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.04830551147461, "learning_rate": 1e-06, "loss": 0.65, "mean_token_accuracy": 0.8541746139526367, "num_tokens": 817256936.0, "step": 21420 }, { "epoch": 2.724971377687317, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.47548294067383, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8651143908500671, "num_tokens": 817292840.0, "step": 21421 }, { "epoch": 2.7250985879659075, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.89198303222656, "learning_rate": 1e-06, "loss": 0.6539, "mean_token_accuracy": 0.8587389588356018, "num_tokens": 817332872.0, "step": 21422 }, { "epoch": 2.725225798244498, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.541988372802734, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8713907599449158, "num_tokens": 817367668.0, "step": 21423 }, { "epoch": 2.7253530085230886, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.06203079223633, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8822627067565918, "num_tokens": 817407726.0, "step": 21424 }, { "epoch": 2.725480218801679, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.308189392089844, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8748200535774231, "num_tokens": 817441003.0, "step": 21425 }, { "epoch": 2.7256074290802697, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.28302764892578, "learning_rate": 1e-06, "loss": 0.593, "mean_token_accuracy": 0.8697239756584167, "num_tokens": 817482587.0, "step": 21426 }, { "epoch": 2.72573463935886, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.2262077331543, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8678107261657715, "num_tokens": 817519727.0, "step": 21427 }, { "epoch": 2.7258618496374507, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.26866912841797, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8592390418052673, "num_tokens": 817553809.0, "step": 21428 }, { "epoch": 2.7259890599160412, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.294639587402344, "learning_rate": 1e-06, "loss": 0.6697, "mean_token_accuracy": 0.8473463654518127, "num_tokens": 817588893.0, "step": 21429 }, { "epoch": 2.7261162701946318, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.10961151123047, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.871518611907959, "num_tokens": 817627789.0, "step": 21430 }, { "epoch": 2.7262434804732223, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.337921142578125, "learning_rate": 1e-06, "loss": 0.6358, "mean_token_accuracy": 0.8541876077651978, "num_tokens": 817670048.0, "step": 21431 }, { "epoch": 2.726370690751813, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.914764404296875, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8694132566452026, "num_tokens": 817707108.0, "step": 21432 }, { "epoch": 2.7264979010304033, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.12839126586914, "learning_rate": 1e-06, "loss": 0.6202, "mean_token_accuracy": 0.8613801002502441, "num_tokens": 817748126.0, "step": 21433 }, { "epoch": 2.726625111308994, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.22262954711914, "learning_rate": 1e-06, "loss": 0.6076, "mean_token_accuracy": 0.8662669658660889, "num_tokens": 817781578.0, "step": 21434 }, { "epoch": 2.7267523215875844, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.976417541503906, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8706121444702148, "num_tokens": 817822143.0, "step": 21435 }, { "epoch": 2.726879531866175, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.367958068847656, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.874419093132019, "num_tokens": 817858456.0, "step": 21436 }, { "epoch": 2.7270067421447655, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.85746765136719, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8718370199203491, "num_tokens": 817892863.0, "step": 21437 }, { "epoch": 2.7271339524233555, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.49015808105469, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8670226335525513, "num_tokens": 817931120.0, "step": 21438 }, { "epoch": 2.7272611627019465, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.58741760253906, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8658238649368286, "num_tokens": 817963076.0, "step": 21439 }, { "epoch": 2.7273883729805366, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.54856872558594, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8642752766609192, "num_tokens": 818003844.0, "step": 21440 }, { "epoch": 2.7275155832591276, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.27766799926758, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8669307231903076, "num_tokens": 818048533.0, "step": 21441 }, { "epoch": 2.7276427935377177, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.18696975708008, "learning_rate": 1e-06, "loss": 0.6436, "mean_token_accuracy": 0.8602233529090881, "num_tokens": 818083290.0, "step": 21442 }, { "epoch": 2.7277700038163086, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.38380432128906, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.873142421245575, "num_tokens": 818120037.0, "step": 21443 }, { "epoch": 2.7278972140948987, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.00776672363281, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8641537427902222, "num_tokens": 818164122.0, "step": 21444 }, { "epoch": 2.7280244243734892, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.0208740234375, "learning_rate": 1e-06, "loss": 0.6133, "mean_token_accuracy": 0.8669732809066772, "num_tokens": 818198065.0, "step": 21445 }, { "epoch": 2.7281516346520798, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.13154602050781, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8645061254501343, "num_tokens": 818236843.0, "step": 21446 }, { "epoch": 2.7282788449306703, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.583160400390625, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8648630380630493, "num_tokens": 818271300.0, "step": 21447 }, { "epoch": 2.728406055209261, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.993350982666016, "learning_rate": 1e-06, "loss": 0.6663, "mean_token_accuracy": 0.8484452962875366, "num_tokens": 818311799.0, "step": 21448 }, { "epoch": 2.7285332654878514, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.5821533203125, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8706188201904297, "num_tokens": 818346662.0, "step": 21449 }, { "epoch": 2.728660475766442, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.97431564331055, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8676583766937256, "num_tokens": 818384465.0, "step": 21450 }, { "epoch": 2.7287876860450324, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.373130798339844, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8619203567504883, "num_tokens": 818425625.0, "step": 21451 }, { "epoch": 2.728914896323623, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.64659881591797, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8695342540740967, "num_tokens": 818458544.0, "step": 21452 }, { "epoch": 2.7290421066022135, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.59541320800781, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8597880005836487, "num_tokens": 818503507.0, "step": 21453 }, { "epoch": 2.729169316880804, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.60972595214844, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8706855177879333, "num_tokens": 818545411.0, "step": 21454 }, { "epoch": 2.7292965271593945, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.516944885253906, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8602811694145203, "num_tokens": 818582100.0, "step": 21455 }, { "epoch": 2.729423737437985, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.057308197021484, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8619858622550964, "num_tokens": 818615549.0, "step": 21456 }, { "epoch": 2.7295509477165756, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.427616119384766, "learning_rate": 1e-06, "loss": 0.5926, "mean_token_accuracy": 0.8686541318893433, "num_tokens": 818652476.0, "step": 21457 }, { "epoch": 2.729678157995166, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.80751037597656, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8693587183952332, "num_tokens": 818692435.0, "step": 21458 }, { "epoch": 2.7298053682737566, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.86943817138672, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8636630773544312, "num_tokens": 818733206.0, "step": 21459 }, { "epoch": 2.729932578552347, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.43609619140625, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8658579587936401, "num_tokens": 818771717.0, "step": 21460 }, { "epoch": 2.7300597888309377, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.14036560058594, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.8627040982246399, "num_tokens": 818806999.0, "step": 21461 }, { "epoch": 2.7301869991095282, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.47538757324219, "learning_rate": 1e-06, "loss": 0.5851, "mean_token_accuracy": 0.8709568977355957, "num_tokens": 818846226.0, "step": 21462 }, { "epoch": 2.7303142093881183, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.645328521728516, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8695781826972961, "num_tokens": 818885667.0, "step": 21463 }, { "epoch": 2.7304414196667093, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.65776824951172, "learning_rate": 1e-06, "loss": 0.5878, "mean_token_accuracy": 0.8715765476226807, "num_tokens": 818924009.0, "step": 21464 }, { "epoch": 2.7305686299452994, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.13019561767578, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8716075420379639, "num_tokens": 818962614.0, "step": 21465 }, { "epoch": 2.7306958402238903, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.663604736328125, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8655672669410706, "num_tokens": 819004486.0, "step": 21466 }, { "epoch": 2.7308230505024804, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.663963317871094, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8572520613670349, "num_tokens": 819048364.0, "step": 21467 }, { "epoch": 2.730950260781071, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.9030876159668, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8609033823013306, "num_tokens": 819086250.0, "step": 21468 }, { "epoch": 2.7310774710596615, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.81451416015625, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8757997751235962, "num_tokens": 819126705.0, "step": 21469 }, { "epoch": 2.731204681338252, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.35081100463867, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.8655211925506592, "num_tokens": 819165967.0, "step": 21470 }, { "epoch": 2.7313318916168425, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.60971450805664, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8614626526832581, "num_tokens": 819208066.0, "step": 21471 }, { "epoch": 2.731459101895433, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.0815315246582, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.861344575881958, "num_tokens": 819246269.0, "step": 21472 }, { "epoch": 2.7315863121740236, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.687068939208984, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8672517538070679, "num_tokens": 819286027.0, "step": 21473 }, { "epoch": 2.731713522452614, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.63953399658203, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8724299669265747, "num_tokens": 819319994.0, "step": 21474 }, { "epoch": 2.7318407327312046, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.61150360107422, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8590834140777588, "num_tokens": 819360113.0, "step": 21475 }, { "epoch": 2.731967943009795, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.072078704833984, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8711053133010864, "num_tokens": 819394575.0, "step": 21476 }, { "epoch": 2.7320951532883857, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.710609436035156, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8578182458877563, "num_tokens": 819434545.0, "step": 21477 }, { "epoch": 2.7322223635669762, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.594886779785156, "learning_rate": 1e-06, "loss": 0.5816, "mean_token_accuracy": 0.8746688961982727, "num_tokens": 819470699.0, "step": 21478 }, { "epoch": 2.7323495738455668, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.70778274536133, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8587046265602112, "num_tokens": 819514447.0, "step": 21479 }, { "epoch": 2.7324767841241573, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.283546447753906, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.8587198257446289, "num_tokens": 819553434.0, "step": 21480 }, { "epoch": 2.732603994402748, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.93891906738281, "learning_rate": 1e-06, "loss": 0.6343, "mean_token_accuracy": 0.8575113415718079, "num_tokens": 819591594.0, "step": 21481 }, { "epoch": 2.7327312046813383, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.7984619140625, "learning_rate": 1e-06, "loss": 0.6264, "mean_token_accuracy": 0.8616520166397095, "num_tokens": 819634530.0, "step": 21482 }, { "epoch": 2.732858414959929, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.5900764465332, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.8545065522193909, "num_tokens": 819669358.0, "step": 21483 }, { "epoch": 2.7329856252385194, "ewc_loss": 0.21875, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.13351821899414, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8627413511276245, "num_tokens": 819711336.0, "step": 21484 }, { "epoch": 2.73311283551711, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.42753982543945, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8535916805267334, "num_tokens": 819746973.0, "step": 21485 }, { "epoch": 2.7332400457957005, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.94464874267578, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.8737137913703918, "num_tokens": 819780305.0, "step": 21486 }, { "epoch": 2.733367256074291, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.009368896484375, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8731369376182556, "num_tokens": 819817326.0, "step": 21487 }, { "epoch": 2.733494466352881, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.84104537963867, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.8581067323684692, "num_tokens": 819857597.0, "step": 21488 }, { "epoch": 2.733621676631472, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.945945739746094, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8653690814971924, "num_tokens": 819893471.0, "step": 21489 }, { "epoch": 2.733748886910062, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.873802185058594, "learning_rate": 1e-06, "loss": 0.6761, "mean_token_accuracy": 0.8470380306243896, "num_tokens": 819934175.0, "step": 21490 }, { "epoch": 2.733876097188653, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.871070861816406, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8731019496917725, "num_tokens": 819971451.0, "step": 21491 }, { "epoch": 2.734003307467243, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.73170471191406, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8596117496490479, "num_tokens": 820011566.0, "step": 21492 }, { "epoch": 2.7341305177458337, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.0649299621582, "learning_rate": 1e-06, "loss": 0.6961, "mean_token_accuracy": 0.8391456007957458, "num_tokens": 820047944.0, "step": 21493 }, { "epoch": 2.7342577280244242, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.47770309448242, "learning_rate": 1e-06, "loss": 0.6513, "mean_token_accuracy": 0.8516765832901001, "num_tokens": 820083416.0, "step": 21494 }, { "epoch": 2.7343849383030148, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.05471420288086, "learning_rate": 1e-06, "loss": 0.6582, "mean_token_accuracy": 0.8567827343940735, "num_tokens": 820122476.0, "step": 21495 }, { "epoch": 2.7345121485816053, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.43010711669922, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.869493305683136, "num_tokens": 820159866.0, "step": 21496 }, { "epoch": 2.734639358860196, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.07172775268555, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.8634272813796997, "num_tokens": 820202710.0, "step": 21497 }, { "epoch": 2.7347665691387864, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.905033111572266, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8623310327529907, "num_tokens": 820239156.0, "step": 21498 }, { "epoch": 2.734893779417377, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.71578598022461, "learning_rate": 1e-06, "loss": 0.5498, "mean_token_accuracy": 0.8821450471878052, "num_tokens": 820274010.0, "step": 21499 }, { "epoch": 2.7350209896959674, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.814544677734375, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.862630307674408, "num_tokens": 820319171.0, "step": 21500 }, { "epoch": 2.735148199974558, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.5088005065918, "learning_rate": 1e-06, "loss": 0.665, "mean_token_accuracy": 0.8488417863845825, "num_tokens": 820355038.0, "step": 21501 }, { "epoch": 2.7352754102531485, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.78916549682617, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8593869209289551, "num_tokens": 820390973.0, "step": 21502 }, { "epoch": 2.735402620531739, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.936302185058594, "learning_rate": 1e-06, "loss": 0.6348, "mean_token_accuracy": 0.8570652604103088, "num_tokens": 820424543.0, "step": 21503 }, { "epoch": 2.7355298308103295, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.74892807006836, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8636270761489868, "num_tokens": 820462702.0, "step": 21504 }, { "epoch": 2.73565704108892, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.52525329589844, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.874523401260376, "num_tokens": 820498840.0, "step": 21505 }, { "epoch": 2.7357842513675106, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.54865646362305, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.8577557802200317, "num_tokens": 820534083.0, "step": 21506 }, { "epoch": 2.735911461646101, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.958412170410156, "learning_rate": 1e-06, "loss": 0.6081, "mean_token_accuracy": 0.8659211993217468, "num_tokens": 820572479.0, "step": 21507 }, { "epoch": 2.7360386719246916, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.242088317871094, "learning_rate": 1e-06, "loss": 0.6493, "mean_token_accuracy": 0.8516899347305298, "num_tokens": 820610283.0, "step": 21508 }, { "epoch": 2.736165882203282, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 51.88557434082031, "learning_rate": 1e-06, "loss": 0.5705, "mean_token_accuracy": 0.8808579444885254, "num_tokens": 820647580.0, "step": 21509 }, { "epoch": 2.7362930924818727, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.51730728149414, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8691961765289307, "num_tokens": 820686774.0, "step": 21510 }, { "epoch": 2.7364203027604628, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.625755310058594, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8738155364990234, "num_tokens": 820724635.0, "step": 21511 }, { "epoch": 2.7365475130390537, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.687068939208984, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8715091943740845, "num_tokens": 820761467.0, "step": 21512 }, { "epoch": 2.736674723317644, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.53976058959961, "learning_rate": 1e-06, "loss": 0.6563, "mean_token_accuracy": 0.8498786687850952, "num_tokens": 820801001.0, "step": 21513 }, { "epoch": 2.736801933596235, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.646087646484375, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.859135627746582, "num_tokens": 820841028.0, "step": 21514 }, { "epoch": 2.736929143874825, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.63737487792969, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8731619119644165, "num_tokens": 820881613.0, "step": 21515 }, { "epoch": 2.737056354153416, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.58411407470703, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.872895359992981, "num_tokens": 820918205.0, "step": 21516 }, { "epoch": 2.737183564432006, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.48271560668945, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8566226959228516, "num_tokens": 820950751.0, "step": 21517 }, { "epoch": 2.7373107747105965, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.0139274597168, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.8656080961227417, "num_tokens": 820987385.0, "step": 21518 }, { "epoch": 2.737437984989187, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.15520095825195, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8563150763511658, "num_tokens": 821026273.0, "step": 21519 }, { "epoch": 2.7375651952677775, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.229488372802734, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8556287288665771, "num_tokens": 821067978.0, "step": 21520 }, { "epoch": 2.737692405546368, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.263092041015625, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8686641454696655, "num_tokens": 821111925.0, "step": 21521 }, { "epoch": 2.7378196158249586, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.12537384033203, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8671399354934692, "num_tokens": 821151019.0, "step": 21522 }, { "epoch": 2.737946826103549, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.42101287841797, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.870814323425293, "num_tokens": 821185520.0, "step": 21523 }, { "epoch": 2.7380740363821396, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.05560302734375, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8651858568191528, "num_tokens": 821224273.0, "step": 21524 }, { "epoch": 2.73820124666073, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.46580505371094, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8709376454353333, "num_tokens": 821263541.0, "step": 21525 }, { "epoch": 2.7383284569393207, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.15403747558594, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8522879481315613, "num_tokens": 821304053.0, "step": 21526 }, { "epoch": 2.7384556672179112, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.694854736328125, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8641318082809448, "num_tokens": 821339710.0, "step": 21527 }, { "epoch": 2.7385828774965018, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.93954086303711, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.863304853439331, "num_tokens": 821378377.0, "step": 21528 }, { "epoch": 2.7387100877750923, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.63164520263672, "learning_rate": 1e-06, "loss": 0.5883, "mean_token_accuracy": 0.8739120960235596, "num_tokens": 821417434.0, "step": 21529 }, { "epoch": 2.738837298053683, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.833072662353516, "learning_rate": 1e-06, "loss": 0.6325, "mean_token_accuracy": 0.8601224422454834, "num_tokens": 821464323.0, "step": 21530 }, { "epoch": 2.7389645083322733, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.82339859008789, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.8620632290840149, "num_tokens": 821509891.0, "step": 21531 }, { "epoch": 2.739091718610864, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.55662155151367, "learning_rate": 1e-06, "loss": 0.6617, "mean_token_accuracy": 0.8533506989479065, "num_tokens": 821557309.0, "step": 21532 }, { "epoch": 2.7392189288894544, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.07036590576172, "learning_rate": 1e-06, "loss": 0.6363, "mean_token_accuracy": 0.8598067760467529, "num_tokens": 821597751.0, "step": 21533 }, { "epoch": 2.739346139168045, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.33361053466797, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8767132759094238, "num_tokens": 821637961.0, "step": 21534 }, { "epoch": 2.7394733494466355, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.070369720458984, "learning_rate": 1e-06, "loss": 0.6628, "mean_token_accuracy": 0.8479585647583008, "num_tokens": 821684728.0, "step": 21535 }, { "epoch": 2.7396005597252255, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.492427825927734, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8694508075714111, "num_tokens": 821715359.0, "step": 21536 }, { "epoch": 2.7397277700038165, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.887386322021484, "learning_rate": 1e-06, "loss": 0.6408, "mean_token_accuracy": 0.8592097163200378, "num_tokens": 821755540.0, "step": 21537 }, { "epoch": 2.7398549802824066, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.482574462890625, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.86424720287323, "num_tokens": 821788506.0, "step": 21538 }, { "epoch": 2.7399821905609976, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.1663818359375, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8622335195541382, "num_tokens": 821826555.0, "step": 21539 }, { "epoch": 2.7401094008395877, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.36149978637695, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8539134860038757, "num_tokens": 821866160.0, "step": 21540 }, { "epoch": 2.7402366111181786, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.07488250732422, "learning_rate": 1e-06, "loss": 0.5963, "mean_token_accuracy": 0.8693141937255859, "num_tokens": 821905595.0, "step": 21541 }, { "epoch": 2.7403638213967687, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 53.04741287231445, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8548473119735718, "num_tokens": 821944769.0, "step": 21542 }, { "epoch": 2.7404910316753592, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.003761291503906, "learning_rate": 1e-06, "loss": 0.5887, "mean_token_accuracy": 0.8717741370201111, "num_tokens": 821988889.0, "step": 21543 }, { "epoch": 2.7406182419539498, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.055477142333984, "learning_rate": 1e-06, "loss": 0.6008, "mean_token_accuracy": 0.8669153451919556, "num_tokens": 822025343.0, "step": 21544 }, { "epoch": 2.7407454522325403, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.081268310546875, "learning_rate": 1e-06, "loss": 0.652, "mean_token_accuracy": 0.8517873287200928, "num_tokens": 822067651.0, "step": 21545 }, { "epoch": 2.740872662511131, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.24907302856445, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8551477193832397, "num_tokens": 822102885.0, "step": 21546 }, { "epoch": 2.7409998727897213, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.290000915527344, "learning_rate": 1e-06, "loss": 0.6442, "mean_token_accuracy": 0.8557640314102173, "num_tokens": 822142515.0, "step": 21547 }, { "epoch": 2.741127083068312, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.38576126098633, "learning_rate": 1e-06, "loss": 0.6263, "mean_token_accuracy": 0.8601986169815063, "num_tokens": 822180745.0, "step": 21548 }, { "epoch": 2.7412542933469024, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.218997955322266, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8733291029930115, "num_tokens": 822215117.0, "step": 21549 }, { "epoch": 2.741381503625493, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.860172271728516, "learning_rate": 1e-06, "loss": 0.6517, "mean_token_accuracy": 0.8545650839805603, "num_tokens": 822258883.0, "step": 21550 }, { "epoch": 2.7415087139040835, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.15148162841797, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8604769706726074, "num_tokens": 822295254.0, "step": 21551 }, { "epoch": 2.741635924182674, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 53.07533264160156, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8662862181663513, "num_tokens": 822335769.0, "step": 21552 }, { "epoch": 2.7417631344612645, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.14625930786133, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8724505305290222, "num_tokens": 822376873.0, "step": 21553 }, { "epoch": 2.741890344739855, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.99888610839844, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8590293526649475, "num_tokens": 822419868.0, "step": 21554 }, { "epoch": 2.7420175550184456, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.99201202392578, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8638862371444702, "num_tokens": 822456757.0, "step": 21555 }, { "epoch": 2.742144765297036, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.566898345947266, "learning_rate": 1e-06, "loss": 0.568, "mean_token_accuracy": 0.8768466711044312, "num_tokens": 822492772.0, "step": 21556 }, { "epoch": 2.7422719755756266, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.61452865600586, "learning_rate": 1e-06, "loss": 0.6688, "mean_token_accuracy": 0.8469339609146118, "num_tokens": 822535400.0, "step": 21557 }, { "epoch": 2.742399185854217, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.588279724121094, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8706183433532715, "num_tokens": 822568313.0, "step": 21558 }, { "epoch": 2.7425263961328077, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.636104583740234, "learning_rate": 1e-06, "loss": 0.5929, "mean_token_accuracy": 0.8689188957214355, "num_tokens": 822608487.0, "step": 21559 }, { "epoch": 2.742653606411398, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.702850341796875, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8638424873352051, "num_tokens": 822647938.0, "step": 21560 }, { "epoch": 2.7427808166899883, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 51.77653121948242, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8605375289916992, "num_tokens": 822685974.0, "step": 21561 }, { "epoch": 2.7429080269685793, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.497459411621094, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8656986951828003, "num_tokens": 822719166.0, "step": 21562 }, { "epoch": 2.7430352372471694, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.82674789428711, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8536459803581238, "num_tokens": 822758364.0, "step": 21563 }, { "epoch": 2.7431624475257603, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.34083938598633, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8707154393196106, "num_tokens": 822798002.0, "step": 21564 }, { "epoch": 2.7432896578043504, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.03434371948242, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8633829355239868, "num_tokens": 822837958.0, "step": 21565 }, { "epoch": 2.743416868082941, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.577186584472656, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8642239570617676, "num_tokens": 822873076.0, "step": 21566 }, { "epoch": 2.7435440783615315, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.10982894897461, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.863538384437561, "num_tokens": 822904693.0, "step": 21567 }, { "epoch": 2.743671288640122, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.741943359375, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8589730262756348, "num_tokens": 822941192.0, "step": 21568 }, { "epoch": 2.7437984989187125, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.869266510009766, "learning_rate": 1e-06, "loss": 0.6519, "mean_token_accuracy": 0.8510568141937256, "num_tokens": 822982166.0, "step": 21569 }, { "epoch": 2.743925709197303, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.28921890258789, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8571863174438477, "num_tokens": 823021630.0, "step": 21570 }, { "epoch": 2.7440529194758936, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.336727142333984, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8624817728996277, "num_tokens": 823062133.0, "step": 21571 }, { "epoch": 2.744180129754484, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.07022476196289, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8606569766998291, "num_tokens": 823105163.0, "step": 21572 }, { "epoch": 2.7443073400330746, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.764503479003906, "learning_rate": 1e-06, "loss": 0.6864, "mean_token_accuracy": 0.8428301811218262, "num_tokens": 823146878.0, "step": 21573 }, { "epoch": 2.744434550311665, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.273536682128906, "learning_rate": 1e-06, "loss": 0.5585, "mean_token_accuracy": 0.8808249831199646, "num_tokens": 823182216.0, "step": 21574 }, { "epoch": 2.7445617605902557, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.788002014160156, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8715084791183472, "num_tokens": 823227838.0, "step": 21575 }, { "epoch": 2.7446889708688462, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.3526496887207, "learning_rate": 1e-06, "loss": 0.6423, "mean_token_accuracy": 0.8546335697174072, "num_tokens": 823268609.0, "step": 21576 }, { "epoch": 2.7448161811474368, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.629093170166016, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8729733228683472, "num_tokens": 823309426.0, "step": 21577 }, { "epoch": 2.7449433914260273, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.35417175292969, "learning_rate": 1e-06, "loss": 0.6296, "mean_token_accuracy": 0.856878399848938, "num_tokens": 823346049.0, "step": 21578 }, { "epoch": 2.745070601704618, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.32185745239258, "learning_rate": 1e-06, "loss": 0.6549, "mean_token_accuracy": 0.857343316078186, "num_tokens": 823378915.0, "step": 21579 }, { "epoch": 2.7451978119832083, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.38966369628906, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8533837795257568, "num_tokens": 823422985.0, "step": 21580 }, { "epoch": 2.745325022261799, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.20768737792969, "learning_rate": 1e-06, "loss": 0.6613, "mean_token_accuracy": 0.8497376441955566, "num_tokens": 823460703.0, "step": 21581 }, { "epoch": 2.7454522325403894, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.316810607910156, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8659553527832031, "num_tokens": 823499651.0, "step": 21582 }, { "epoch": 2.74557944281898, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.153038024902344, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8628831505775452, "num_tokens": 823537044.0, "step": 21583 }, { "epoch": 2.7457066530975704, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.80641555786133, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8773003816604614, "num_tokens": 823571832.0, "step": 21584 }, { "epoch": 2.745833863376161, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.99005889892578, "learning_rate": 1e-06, "loss": 0.591, "mean_token_accuracy": 0.8703000545501709, "num_tokens": 823609852.0, "step": 21585 }, { "epoch": 2.745961073654751, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.865543365478516, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8665545582771301, "num_tokens": 823644419.0, "step": 21586 }, { "epoch": 2.746088283933342, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.0458984375, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8689547777175903, "num_tokens": 823682451.0, "step": 21587 }, { "epoch": 2.746215494211932, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.12031173706055, "learning_rate": 1e-06, "loss": 0.587, "mean_token_accuracy": 0.8746465444564819, "num_tokens": 823723051.0, "step": 21588 }, { "epoch": 2.746342704490523, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.37189483642578, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8616623878479004, "num_tokens": 823761353.0, "step": 21589 }, { "epoch": 2.746469914769113, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.1615104675293, "learning_rate": 1e-06, "loss": 0.6783, "mean_token_accuracy": 0.848575234413147, "num_tokens": 823800281.0, "step": 21590 }, { "epoch": 2.7465971250477037, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.7531852722168, "learning_rate": 1e-06, "loss": 0.5678, "mean_token_accuracy": 0.8773922920227051, "num_tokens": 823843255.0, "step": 21591 }, { "epoch": 2.7467243353262942, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.295631408691406, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8599321842193604, "num_tokens": 823880736.0, "step": 21592 }, { "epoch": 2.7468515456048848, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.02154541015625, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8598585724830627, "num_tokens": 823923232.0, "step": 21593 }, { "epoch": 2.7469787558834753, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 53.0324592590332, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8688504695892334, "num_tokens": 823957488.0, "step": 21594 }, { "epoch": 2.747105966162066, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.431392669677734, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8591421246528625, "num_tokens": 823999568.0, "step": 21595 }, { "epoch": 2.7472331764406563, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.950897216796875, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8740154504776001, "num_tokens": 824036947.0, "step": 21596 }, { "epoch": 2.747360386719247, "ewc_loss": 0.208984375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018596649169921875, "grad_norm": 52.51436233520508, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8757678866386414, "num_tokens": 824067966.0, "step": 21597 }, { "epoch": 2.7474875969978374, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.52903747558594, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8617502450942993, "num_tokens": 824112216.0, "step": 21598 }, { "epoch": 2.747614807276428, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.27644348144531, "learning_rate": 1e-06, "loss": 0.5697, "mean_token_accuracy": 0.8735582232475281, "num_tokens": 824148749.0, "step": 21599 }, { "epoch": 2.7477420175550185, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.558868408203125, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8617836236953735, "num_tokens": 824186685.0, "step": 21600 }, { "epoch": 2.747869227833609, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.682281494140625, "learning_rate": 1e-06, "loss": 0.6336, "mean_token_accuracy": 0.8577300310134888, "num_tokens": 824223716.0, "step": 21601 }, { "epoch": 2.7479964381121995, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.238956451416016, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8620426654815674, "num_tokens": 824264627.0, "step": 21602 }, { "epoch": 2.74812364839079, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.54145431518555, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8645110130310059, "num_tokens": 824301901.0, "step": 21603 }, { "epoch": 2.7482508586693806, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.57602310180664, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8679238557815552, "num_tokens": 824344621.0, "step": 21604 }, { "epoch": 2.748378068947971, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.94905090332031, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.870775580406189, "num_tokens": 824385850.0, "step": 21605 }, { "epoch": 2.7485052792265616, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.23844528198242, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8904659152030945, "num_tokens": 824425545.0, "step": 21606 }, { "epoch": 2.748632489505152, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.73255920410156, "learning_rate": 1e-06, "loss": 0.5654, "mean_token_accuracy": 0.8792328834533691, "num_tokens": 824468611.0, "step": 21607 }, { "epoch": 2.7487596997837427, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.39382553100586, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8682827949523926, "num_tokens": 824512161.0, "step": 21608 }, { "epoch": 2.7488869100623328, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.9604606628418, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8764089345932007, "num_tokens": 824555502.0, "step": 21609 }, { "epoch": 2.7490141203409237, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.434471130371094, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8587409853935242, "num_tokens": 824596277.0, "step": 21610 }, { "epoch": 2.749141330619514, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.72193145751953, "learning_rate": 1e-06, "loss": 0.6642, "mean_token_accuracy": 0.8496955633163452, "num_tokens": 824633704.0, "step": 21611 }, { "epoch": 2.749268540898105, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.712364196777344, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8680269718170166, "num_tokens": 824673584.0, "step": 21612 }, { "epoch": 2.749395751176695, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.54353332519531, "learning_rate": 1e-06, "loss": 0.6565, "mean_token_accuracy": 0.8546308279037476, "num_tokens": 824712508.0, "step": 21613 }, { "epoch": 2.749522961455286, "ewc_loss": 0.2080078125, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001850128173828125, "grad_norm": 51.98326110839844, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8751419186592102, "num_tokens": 824754353.0, "step": 21614 }, { "epoch": 2.749650171733876, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.92469024658203, "learning_rate": 1e-06, "loss": 0.6717, "mean_token_accuracy": 0.8455339670181274, "num_tokens": 824796333.0, "step": 21615 }, { "epoch": 2.7497773820124665, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.2213249206543, "learning_rate": 1e-06, "loss": 0.6219, "mean_token_accuracy": 0.8580471873283386, "num_tokens": 824832048.0, "step": 21616 }, { "epoch": 2.749904592291057, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.04157257080078, "learning_rate": 1e-06, "loss": 0.6473, "mean_token_accuracy": 0.8575356006622314, "num_tokens": 824871889.0, "step": 21617 }, { "epoch": 2.7500318025696475, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.25949478149414, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8738561868667603, "num_tokens": 824912547.0, "step": 21618 }, { "epoch": 2.750159012848238, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.65431594848633, "learning_rate": 1e-06, "loss": 0.6888, "mean_token_accuracy": 0.8419632315635681, "num_tokens": 824955626.0, "step": 21619 }, { "epoch": 2.7502862231268286, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.46746826171875, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8714863657951355, "num_tokens": 824991813.0, "step": 21620 }, { "epoch": 2.750413433405419, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.82563400268555, "learning_rate": 1e-06, "loss": 0.5783, "mean_token_accuracy": 0.8733556270599365, "num_tokens": 825024356.0, "step": 21621 }, { "epoch": 2.7505406436840096, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.34150695800781, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8594503402709961, "num_tokens": 825060316.0, "step": 21622 }, { "epoch": 2.7506678539626, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.21949005126953, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8567909002304077, "num_tokens": 825096044.0, "step": 21623 }, { "epoch": 2.7507950642411907, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.33384323120117, "learning_rate": 1e-06, "loss": 0.6116, "mean_token_accuracy": 0.8610842227935791, "num_tokens": 825123999.0, "step": 21624 }, { "epoch": 2.750922274519781, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.88467025756836, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8527863025665283, "num_tokens": 825170155.0, "step": 21625 }, { "epoch": 2.7510494847983717, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.37562561035156, "learning_rate": 1e-06, "loss": 0.6178, "mean_token_accuracy": 0.8620022535324097, "num_tokens": 825213388.0, "step": 21626 }, { "epoch": 2.7511766950769623, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.244224548339844, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8694761991500854, "num_tokens": 825247171.0, "step": 21627 }, { "epoch": 2.751303905355553, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.481082916259766, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8630656003952026, "num_tokens": 825283271.0, "step": 21628 }, { "epoch": 2.7514311156341433, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.911293029785156, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8633503913879395, "num_tokens": 825318976.0, "step": 21629 }, { "epoch": 2.751558325912734, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.506866455078125, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8623470067977905, "num_tokens": 825358019.0, "step": 21630 }, { "epoch": 2.7516855361913244, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.20761489868164, "learning_rate": 1e-06, "loss": 0.5694, "mean_token_accuracy": 0.8813493251800537, "num_tokens": 825394025.0, "step": 21631 }, { "epoch": 2.751812746469915, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 53.29978561401367, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8739752769470215, "num_tokens": 825433021.0, "step": 21632 }, { "epoch": 2.7519399567485054, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.5794563293457, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8631102442741394, "num_tokens": 825477026.0, "step": 21633 }, { "epoch": 2.7520671670270955, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.69786834716797, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8688817620277405, "num_tokens": 825511054.0, "step": 21634 }, { "epoch": 2.7521943773056865, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.957969665527344, "learning_rate": 1e-06, "loss": 0.6551, "mean_token_accuracy": 0.8500329256057739, "num_tokens": 825549654.0, "step": 21635 }, { "epoch": 2.7523215875842766, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.251094818115234, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8651270270347595, "num_tokens": 825587269.0, "step": 21636 }, { "epoch": 2.7524487978628676, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.059043884277344, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.8883928060531616, "num_tokens": 825624864.0, "step": 21637 }, { "epoch": 2.7525760081414576, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.285926818847656, "learning_rate": 1e-06, "loss": 0.6453, "mean_token_accuracy": 0.8563531637191772, "num_tokens": 825667025.0, "step": 21638 }, { "epoch": 2.7527032184200486, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.373870849609375, "learning_rate": 1e-06, "loss": 0.6658, "mean_token_accuracy": 0.8483526706695557, "num_tokens": 825709505.0, "step": 21639 }, { "epoch": 2.7528304286986387, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 53.107051849365234, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8672959208488464, "num_tokens": 825747376.0, "step": 21640 }, { "epoch": 2.7529576389772292, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.33527755737305, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8710321187973022, "num_tokens": 825784817.0, "step": 21641 }, { "epoch": 2.7530848492558198, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.56904983520508, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8602193593978882, "num_tokens": 825815631.0, "step": 21642 }, { "epoch": 2.7532120595344103, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.50811767578125, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8636295795440674, "num_tokens": 825854202.0, "step": 21643 }, { "epoch": 2.753339269813001, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.08103942871094, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8718165755271912, "num_tokens": 825890636.0, "step": 21644 }, { "epoch": 2.7534664800915913, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.32035446166992, "learning_rate": 1e-06, "loss": 0.6737, "mean_token_accuracy": 0.8397907018661499, "num_tokens": 825931054.0, "step": 21645 }, { "epoch": 2.753593690370182, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.57915115356445, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8588548898696899, "num_tokens": 825972519.0, "step": 21646 }, { "epoch": 2.7537209006487724, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.61484146118164, "learning_rate": 1e-06, "loss": 0.6005, "mean_token_accuracy": 0.8665865659713745, "num_tokens": 826010010.0, "step": 21647 }, { "epoch": 2.753848110927363, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.440738677978516, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8645744323730469, "num_tokens": 826048348.0, "step": 21648 }, { "epoch": 2.7539753212059535, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.64855194091797, "learning_rate": 1e-06, "loss": 0.6246, "mean_token_accuracy": 0.861266016960144, "num_tokens": 826087807.0, "step": 21649 }, { "epoch": 2.754102531484544, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.40258026123047, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.859756350517273, "num_tokens": 826124657.0, "step": 21650 }, { "epoch": 2.7542297417631345, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.44984817504883, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8652160167694092, "num_tokens": 826164190.0, "step": 21651 }, { "epoch": 2.754356952041725, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.97572708129883, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8647922277450562, "num_tokens": 826202876.0, "step": 21652 }, { "epoch": 2.7544841623203156, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.935394287109375, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.86104416847229, "num_tokens": 826242955.0, "step": 21653 }, { "epoch": 2.754611372598906, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.87897491455078, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8737912178039551, "num_tokens": 826279925.0, "step": 21654 }, { "epoch": 2.7547385828774966, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.8802490234375, "learning_rate": 1e-06, "loss": 0.5815, "mean_token_accuracy": 0.8740147352218628, "num_tokens": 826314076.0, "step": 21655 }, { "epoch": 2.754865793156087, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.90513229370117, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8596150279045105, "num_tokens": 826354138.0, "step": 21656 }, { "epoch": 2.7549930034346777, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.643821716308594, "learning_rate": 1e-06, "loss": 0.6407, "mean_token_accuracy": 0.8577098846435547, "num_tokens": 826395393.0, "step": 21657 }, { "epoch": 2.755120213713268, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.92955780029297, "learning_rate": 1e-06, "loss": 0.6588, "mean_token_accuracy": 0.8530384302139282, "num_tokens": 826437539.0, "step": 21658 }, { "epoch": 2.7552474239918583, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.60325241088867, "learning_rate": 1e-06, "loss": 0.6442, "mean_token_accuracy": 0.8562988042831421, "num_tokens": 826475857.0, "step": 21659 }, { "epoch": 2.7553746342704493, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.655948638916016, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8724632263183594, "num_tokens": 826513956.0, "step": 21660 }, { "epoch": 2.7555018445490393, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.74964904785156, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.86150062084198, "num_tokens": 826552747.0, "step": 21661 }, { "epoch": 2.7556290548276303, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.0592041015625, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8707851767539978, "num_tokens": 826590830.0, "step": 21662 }, { "epoch": 2.7557562651062204, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.02471160888672, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.866727352142334, "num_tokens": 826633414.0, "step": 21663 }, { "epoch": 2.755883475384811, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.42523193359375, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8728082180023193, "num_tokens": 826668344.0, "step": 21664 }, { "epoch": 2.7560106856634015, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.404212951660156, "learning_rate": 1e-06, "loss": 0.6774, "mean_token_accuracy": 0.8484209775924683, "num_tokens": 826710587.0, "step": 21665 }, { "epoch": 2.756137895941992, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.935340881347656, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8533769845962524, "num_tokens": 826748854.0, "step": 21666 }, { "epoch": 2.7562651062205825, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.427974700927734, "learning_rate": 1e-06, "loss": 0.6734, "mean_token_accuracy": 0.85008305311203, "num_tokens": 826789591.0, "step": 21667 }, { "epoch": 2.756392316499173, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.52541732788086, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8587127923965454, "num_tokens": 826831890.0, "step": 21668 }, { "epoch": 2.7565195267777636, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.78939437866211, "learning_rate": 1e-06, "loss": 0.6858, "mean_token_accuracy": 0.8450543880462646, "num_tokens": 826870265.0, "step": 21669 }, { "epoch": 2.756646737056354, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.78035354614258, "learning_rate": 1e-06, "loss": 0.605, "mean_token_accuracy": 0.862617552280426, "num_tokens": 826910993.0, "step": 21670 }, { "epoch": 2.7567739473349446, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.77322006225586, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8608211278915405, "num_tokens": 826947492.0, "step": 21671 }, { "epoch": 2.756901157613535, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.32662582397461, "learning_rate": 1e-06, "loss": 0.6037, "mean_token_accuracy": 0.865275502204895, "num_tokens": 826987215.0, "step": 21672 }, { "epoch": 2.7570283678921257, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 54.211029052734375, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8597142696380615, "num_tokens": 827024092.0, "step": 21673 }, { "epoch": 2.757155578170716, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 51.36604309082031, "learning_rate": 1e-06, "loss": 0.6758, "mean_token_accuracy": 0.8467506170272827, "num_tokens": 827060541.0, "step": 21674 }, { "epoch": 2.7572827884493067, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.741294860839844, "learning_rate": 1e-06, "loss": 0.6311, "mean_token_accuracy": 0.8627468943595886, "num_tokens": 827101205.0, "step": 21675 }, { "epoch": 2.7574099987278973, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.227474212646484, "learning_rate": 1e-06, "loss": 0.6366, "mean_token_accuracy": 0.8549524545669556, "num_tokens": 827132441.0, "step": 21676 }, { "epoch": 2.757537209006488, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.87969970703125, "learning_rate": 1e-06, "loss": 0.6279, "mean_token_accuracy": 0.8631613850593567, "num_tokens": 827169530.0, "step": 21677 }, { "epoch": 2.7576644192850783, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.422828674316406, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8678334355354309, "num_tokens": 827207352.0, "step": 21678 }, { "epoch": 2.757791629563669, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.81538391113281, "learning_rate": 1e-06, "loss": 0.6338, "mean_token_accuracy": 0.8564698696136475, "num_tokens": 827242790.0, "step": 21679 }, { "epoch": 2.7579188398422594, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.91082000732422, "learning_rate": 1e-06, "loss": 0.6246, "mean_token_accuracy": 0.8643168807029724, "num_tokens": 827281676.0, "step": 21680 }, { "epoch": 2.75804605012085, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.458091735839844, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.875417172908783, "num_tokens": 827313676.0, "step": 21681 }, { "epoch": 2.7581732603994404, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.97066116333008, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8623759150505066, "num_tokens": 827355017.0, "step": 21682 }, { "epoch": 2.758300470678031, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.64373779296875, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.880800724029541, "num_tokens": 827391773.0, "step": 21683 }, { "epoch": 2.758427680956621, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.44613265991211, "learning_rate": 1e-06, "loss": 0.577, "mean_token_accuracy": 0.8785281181335449, "num_tokens": 827433034.0, "step": 21684 }, { "epoch": 2.758554891235212, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.87470626831055, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8754076957702637, "num_tokens": 827475729.0, "step": 21685 }, { "epoch": 2.758682101513802, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.91605758666992, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.867160439491272, "num_tokens": 827514127.0, "step": 21686 }, { "epoch": 2.758809311792393, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.20953369140625, "learning_rate": 1e-06, "loss": 0.5903, "mean_token_accuracy": 0.8741427063941956, "num_tokens": 827546709.0, "step": 21687 }, { "epoch": 2.758936522070983, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.20110321044922, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8833035230636597, "num_tokens": 827581450.0, "step": 21688 }, { "epoch": 2.7590637323495737, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.020545959472656, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8601242303848267, "num_tokens": 827623161.0, "step": 21689 }, { "epoch": 2.7591909426281642, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.97739791870117, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8594009876251221, "num_tokens": 827659460.0, "step": 21690 }, { "epoch": 2.7593181529067548, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.25490188598633, "learning_rate": 1e-06, "loss": 0.6428, "mean_token_accuracy": 0.8577187061309814, "num_tokens": 827701110.0, "step": 21691 }, { "epoch": 2.7594453631853453, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.770469665527344, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8599541187286377, "num_tokens": 827738992.0, "step": 21692 }, { "epoch": 2.759572573463936, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.68021011352539, "learning_rate": 1e-06, "loss": 0.6643, "mean_token_accuracy": 0.8488096594810486, "num_tokens": 827775853.0, "step": 21693 }, { "epoch": 2.7596997837425263, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.66321563720703, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8660008311271667, "num_tokens": 827819438.0, "step": 21694 }, { "epoch": 2.759826994021117, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.60014724731445, "learning_rate": 1e-06, "loss": 0.6309, "mean_token_accuracy": 0.8574663996696472, "num_tokens": 827858833.0, "step": 21695 }, { "epoch": 2.7599542042997074, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.03062057495117, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8555148839950562, "num_tokens": 827895069.0, "step": 21696 }, { "epoch": 2.760081414578298, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.86263656616211, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.876511812210083, "num_tokens": 827932970.0, "step": 21697 }, { "epoch": 2.7602086248568884, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.888179779052734, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8750737309455872, "num_tokens": 827981859.0, "step": 21698 }, { "epoch": 2.760335835135479, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.79636764526367, "learning_rate": 1e-06, "loss": 0.6426, "mean_token_accuracy": 0.8568298816680908, "num_tokens": 828022018.0, "step": 21699 }, { "epoch": 2.7604630454140695, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.141361236572266, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8568482398986816, "num_tokens": 828061546.0, "step": 21700 }, { "epoch": 2.76059025569266, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.062374114990234, "learning_rate": 1e-06, "loss": 0.6674, "mean_token_accuracy": 0.8492915630340576, "num_tokens": 828104464.0, "step": 21701 }, { "epoch": 2.7607174659712506, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.37594223022461, "learning_rate": 1e-06, "loss": 0.6617, "mean_token_accuracy": 0.8526418209075928, "num_tokens": 828142527.0, "step": 21702 }, { "epoch": 2.760844676249841, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.59335708618164, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8644767999649048, "num_tokens": 828189670.0, "step": 21703 }, { "epoch": 2.7609718865284316, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.89292526245117, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8707578182220459, "num_tokens": 828231511.0, "step": 21704 }, { "epoch": 2.761099096807022, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.10282516479492, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.863372802734375, "num_tokens": 828274387.0, "step": 21705 }, { "epoch": 2.7612263070856127, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.16162872314453, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8561937808990479, "num_tokens": 828312263.0, "step": 21706 }, { "epoch": 2.7613535173642028, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.05189514160156, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8652198910713196, "num_tokens": 828345550.0, "step": 21707 }, { "epoch": 2.7614807276427937, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.0180549621582, "learning_rate": 1e-06, "loss": 0.6644, "mean_token_accuracy": 0.8510982990264893, "num_tokens": 828378633.0, "step": 21708 }, { "epoch": 2.761607937921384, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.285614013671875, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8564154505729675, "num_tokens": 828414292.0, "step": 21709 }, { "epoch": 2.761735148199975, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.451663970947266, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8623872995376587, "num_tokens": 828452682.0, "step": 21710 }, { "epoch": 2.761862358478565, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.530216217041016, "learning_rate": 1e-06, "loss": 0.5701, "mean_token_accuracy": 0.8799084424972534, "num_tokens": 828494162.0, "step": 21711 }, { "epoch": 2.761989568757156, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.687591552734375, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.8680412769317627, "num_tokens": 828525997.0, "step": 21712 }, { "epoch": 2.762116779035746, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.211421966552734, "learning_rate": 1e-06, "loss": 0.6266, "mean_token_accuracy": 0.8602396249771118, "num_tokens": 828562488.0, "step": 21713 }, { "epoch": 2.7622439893143365, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.93035888671875, "learning_rate": 1e-06, "loss": 0.5635, "mean_token_accuracy": 0.8797551393508911, "num_tokens": 828591818.0, "step": 21714 }, { "epoch": 2.762371199592927, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.187538146972656, "learning_rate": 1e-06, "loss": 0.6594, "mean_token_accuracy": 0.8511576652526855, "num_tokens": 828628872.0, "step": 21715 }, { "epoch": 2.7624984098715175, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.142555236816406, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8719728589057922, "num_tokens": 828670939.0, "step": 21716 }, { "epoch": 2.762625620150108, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.53572463989258, "learning_rate": 1e-06, "loss": 0.6744, "mean_token_accuracy": 0.8442103266716003, "num_tokens": 828708446.0, "step": 21717 }, { "epoch": 2.7627528304286986, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.06496047973633, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8676552176475525, "num_tokens": 828749873.0, "step": 21718 }, { "epoch": 2.762880040707289, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.21828842163086, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.855139970779419, "num_tokens": 828790066.0, "step": 21719 }, { "epoch": 2.7630072509858796, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.55232238769531, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.858975887298584, "num_tokens": 828821911.0, "step": 21720 }, { "epoch": 2.76313446126447, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.31983184814453, "learning_rate": 1e-06, "loss": 0.6343, "mean_token_accuracy": 0.8569494485855103, "num_tokens": 828856572.0, "step": 21721 }, { "epoch": 2.7632616715430607, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.24176025390625, "learning_rate": 1e-06, "loss": 0.5741, "mean_token_accuracy": 0.8754814267158508, "num_tokens": 828898865.0, "step": 21722 }, { "epoch": 2.763388881821651, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.16633987426758, "learning_rate": 1e-06, "loss": 0.5934, "mean_token_accuracy": 0.871688723564148, "num_tokens": 828933768.0, "step": 21723 }, { "epoch": 2.7635160921002417, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.97808837890625, "learning_rate": 1e-06, "loss": 0.5748, "mean_token_accuracy": 0.8795213103294373, "num_tokens": 828968869.0, "step": 21724 }, { "epoch": 2.7636433023788323, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.857147216796875, "learning_rate": 1e-06, "loss": 0.6799, "mean_token_accuracy": 0.8485586643218994, "num_tokens": 829005621.0, "step": 21725 }, { "epoch": 2.763770512657423, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.040061950683594, "learning_rate": 1e-06, "loss": 0.6576, "mean_token_accuracy": 0.851698100566864, "num_tokens": 829045046.0, "step": 21726 }, { "epoch": 2.7638977229360133, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.9150390625, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8762063384056091, "num_tokens": 829084588.0, "step": 21727 }, { "epoch": 2.764024933214604, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.69112014770508, "learning_rate": 1e-06, "loss": 0.69, "mean_token_accuracy": 0.846333920955658, "num_tokens": 829121010.0, "step": 21728 }, { "epoch": 2.7641521434931944, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.33632278442383, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8729428648948669, "num_tokens": 829159618.0, "step": 21729 }, { "epoch": 2.764279353771785, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.55511474609375, "learning_rate": 1e-06, "loss": 0.6707, "mean_token_accuracy": 0.8501008749008179, "num_tokens": 829204826.0, "step": 21730 }, { "epoch": 2.7644065640503754, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.571781158447266, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.8739307522773743, "num_tokens": 829240741.0, "step": 21731 }, { "epoch": 2.7645337743289655, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.161842346191406, "learning_rate": 1e-06, "loss": 0.5652, "mean_token_accuracy": 0.8825719952583313, "num_tokens": 829278065.0, "step": 21732 }, { "epoch": 2.7646609846075565, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.86066436767578, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8595837354660034, "num_tokens": 829319438.0, "step": 21733 }, { "epoch": 2.7647881948861466, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.42239761352539, "learning_rate": 1e-06, "loss": 0.5857, "mean_token_accuracy": 0.8741440773010254, "num_tokens": 829357299.0, "step": 21734 }, { "epoch": 2.7649154051647375, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.898292541503906, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8644673228263855, "num_tokens": 829398600.0, "step": 21735 }, { "epoch": 2.7650426154433276, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.298458099365234, "learning_rate": 1e-06, "loss": 0.6492, "mean_token_accuracy": 0.8515506386756897, "num_tokens": 829439214.0, "step": 21736 }, { "epoch": 2.7651698257219186, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.59185028076172, "learning_rate": 1e-06, "loss": 0.6621, "mean_token_accuracy": 0.8516960144042969, "num_tokens": 829477634.0, "step": 21737 }, { "epoch": 2.7652970360005087, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.210453033447266, "learning_rate": 1e-06, "loss": 0.5974, "mean_token_accuracy": 0.8705984354019165, "num_tokens": 829514527.0, "step": 21738 }, { "epoch": 2.765424246279099, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.65135192871094, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8648679852485657, "num_tokens": 829558531.0, "step": 21739 }, { "epoch": 2.7655514565576897, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.99826431274414, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8658487796783447, "num_tokens": 829600769.0, "step": 21740 }, { "epoch": 2.7656786668362803, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.59035110473633, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8674922585487366, "num_tokens": 829635938.0, "step": 21741 }, { "epoch": 2.765805877114871, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.29713439941406, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8614619970321655, "num_tokens": 829672956.0, "step": 21742 }, { "epoch": 2.7659330873934613, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.36549758911133, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8623700141906738, "num_tokens": 829712658.0, "step": 21743 }, { "epoch": 2.766060297672052, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.52118682861328, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.866405189037323, "num_tokens": 829755429.0, "step": 21744 }, { "epoch": 2.7661875079506424, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.51770782470703, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.861811637878418, "num_tokens": 829796373.0, "step": 21745 }, { "epoch": 2.766314718229233, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.37408447265625, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8583983182907104, "num_tokens": 829834049.0, "step": 21746 }, { "epoch": 2.7664419285078234, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.639286041259766, "learning_rate": 1e-06, "loss": 0.6196, "mean_token_accuracy": 0.8632100224494934, "num_tokens": 829870853.0, "step": 21747 }, { "epoch": 2.766569138786414, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.72532272338867, "learning_rate": 1e-06, "loss": 0.6407, "mean_token_accuracy": 0.8581112623214722, "num_tokens": 829907519.0, "step": 21748 }, { "epoch": 2.7666963490650045, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.011940002441406, "learning_rate": 1e-06, "loss": 0.6586, "mean_token_accuracy": 0.8506880402565002, "num_tokens": 829947162.0, "step": 21749 }, { "epoch": 2.766823559343595, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.352481842041016, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8615676164627075, "num_tokens": 829991981.0, "step": 21750 }, { "epoch": 2.7669507696221856, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.98169708251953, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8695980310440063, "num_tokens": 830032409.0, "step": 21751 }, { "epoch": 2.767077979900776, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.36683654785156, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8668868541717529, "num_tokens": 830072738.0, "step": 21752 }, { "epoch": 2.7672051901793666, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.72844314575195, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8588433265686035, "num_tokens": 830110880.0, "step": 21753 }, { "epoch": 2.767332400457957, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.709381103515625, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.857866644859314, "num_tokens": 830150148.0, "step": 21754 }, { "epoch": 2.7674596107365477, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.51823806762695, "learning_rate": 1e-06, "loss": 0.6694, "mean_token_accuracy": 0.8494502305984497, "num_tokens": 830190717.0, "step": 21755 }, { "epoch": 2.767586821015138, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.43669128417969, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8653521537780762, "num_tokens": 830226972.0, "step": 21756 }, { "epoch": 2.7677140312937283, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.864803314208984, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8727956414222717, "num_tokens": 830262612.0, "step": 21757 }, { "epoch": 2.7678412415723193, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.461822509765625, "learning_rate": 1e-06, "loss": 0.5936, "mean_token_accuracy": 0.8685754537582397, "num_tokens": 830306856.0, "step": 21758 }, { "epoch": 2.7679684518509093, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.53887176513672, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8616434335708618, "num_tokens": 830340726.0, "step": 21759 }, { "epoch": 2.7680956621295003, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.89063262939453, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8626051545143127, "num_tokens": 830380697.0, "step": 21760 }, { "epoch": 2.7682228724080904, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.571834564208984, "learning_rate": 1e-06, "loss": 0.6737, "mean_token_accuracy": 0.8407403230667114, "num_tokens": 830414737.0, "step": 21761 }, { "epoch": 2.768350082686681, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.27510070800781, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8648319840431213, "num_tokens": 830456518.0, "step": 21762 }, { "epoch": 2.7684772929652715, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.84035110473633, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8597717881202698, "num_tokens": 830494192.0, "step": 21763 }, { "epoch": 2.768604503243862, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.387672424316406, "learning_rate": 1e-06, "loss": 0.6522, "mean_token_accuracy": 0.8568037748336792, "num_tokens": 830533659.0, "step": 21764 }, { "epoch": 2.7687317135224525, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.87732696533203, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8565074801445007, "num_tokens": 830569605.0, "step": 21765 }, { "epoch": 2.768858923801043, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.28803634643555, "learning_rate": 1e-06, "loss": 0.6884, "mean_token_accuracy": 0.8415306806564331, "num_tokens": 830610814.0, "step": 21766 }, { "epoch": 2.7689861340796336, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.53765106201172, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.8630722165107727, "num_tokens": 830646811.0, "step": 21767 }, { "epoch": 2.769113344358224, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.64835739135742, "learning_rate": 1e-06, "loss": 0.6025, "mean_token_accuracy": 0.8677246570587158, "num_tokens": 830680984.0, "step": 21768 }, { "epoch": 2.7692405546368146, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.84810256958008, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.8634449243545532, "num_tokens": 830713384.0, "step": 21769 }, { "epoch": 2.769367764915405, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.5782356262207, "learning_rate": 1e-06, "loss": 0.6501, "mean_token_accuracy": 0.8508288860321045, "num_tokens": 830746922.0, "step": 21770 }, { "epoch": 2.7694949751939957, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.00345993041992, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8715238571166992, "num_tokens": 830789459.0, "step": 21771 }, { "epoch": 2.769622185472586, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.723148345947266, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8612179756164551, "num_tokens": 830826736.0, "step": 21772 }, { "epoch": 2.7697493957511767, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.84156799316406, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8652240037918091, "num_tokens": 830868118.0, "step": 21773 }, { "epoch": 2.7698766060297673, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 53.0875129699707, "learning_rate": 1e-06, "loss": 0.6264, "mean_token_accuracy": 0.8598601222038269, "num_tokens": 830911658.0, "step": 21774 }, { "epoch": 2.770003816308358, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.594173431396484, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8630107641220093, "num_tokens": 830948456.0, "step": 21775 }, { "epoch": 2.7701310265869483, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.18291473388672, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8723317384719849, "num_tokens": 830988790.0, "step": 21776 }, { "epoch": 2.770258236865539, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.11258316040039, "learning_rate": 1e-06, "loss": 0.6432, "mean_token_accuracy": 0.8556244969367981, "num_tokens": 831024551.0, "step": 21777 }, { "epoch": 2.7703854471441294, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.38663864135742, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8598103523254395, "num_tokens": 831060082.0, "step": 21778 }, { "epoch": 2.77051265742272, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.39326095581055, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8631850481033325, "num_tokens": 831100067.0, "step": 21779 }, { "epoch": 2.77063986770131, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.60960006713867, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8744185566902161, "num_tokens": 831131923.0, "step": 21780 }, { "epoch": 2.770767077979901, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 52.54795455932617, "learning_rate": 1e-06, "loss": 0.6631, "mean_token_accuracy": 0.8461639881134033, "num_tokens": 831167985.0, "step": 21781 }, { "epoch": 2.770894288258491, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.0761604309082, "learning_rate": 1e-06, "loss": 0.6436, "mean_token_accuracy": 0.8578441143035889, "num_tokens": 831211737.0, "step": 21782 }, { "epoch": 2.771021498537082, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.77864074707031, "learning_rate": 1e-06, "loss": 0.6451, "mean_token_accuracy": 0.8512451648712158, "num_tokens": 831248764.0, "step": 21783 }, { "epoch": 2.771148708815672, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.072845458984375, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8684513568878174, "num_tokens": 831284140.0, "step": 21784 }, { "epoch": 2.771275919094263, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.335693359375, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8663697242736816, "num_tokens": 831318567.0, "step": 21785 }, { "epoch": 2.771403129372853, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.122005462646484, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.856266975402832, "num_tokens": 831357128.0, "step": 21786 }, { "epoch": 2.7715303396514437, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.1259880065918, "learning_rate": 1e-06, "loss": 0.6896, "mean_token_accuracy": 0.8408676385879517, "num_tokens": 831391551.0, "step": 21787 }, { "epoch": 2.771657549930034, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.21079635620117, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8597020506858826, "num_tokens": 831430332.0, "step": 21788 }, { "epoch": 2.7717847602086247, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.476234436035156, "learning_rate": 1e-06, "loss": 0.5939, "mean_token_accuracy": 0.8676388263702393, "num_tokens": 831471639.0, "step": 21789 }, { "epoch": 2.7719119704872153, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.53384780883789, "learning_rate": 1e-06, "loss": 0.5969, "mean_token_accuracy": 0.868005096912384, "num_tokens": 831506470.0, "step": 21790 }, { "epoch": 2.772039180765806, "ewc_loss": 0.2099609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.201995849609375, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8562524318695068, "num_tokens": 831547212.0, "step": 21791 }, { "epoch": 2.7721663910443963, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.51784133911133, "learning_rate": 1e-06, "loss": 0.6621, "mean_token_accuracy": 0.8488286137580872, "num_tokens": 831593181.0, "step": 21792 }, { "epoch": 2.772293601322987, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.349483489990234, "learning_rate": 1e-06, "loss": 0.5544, "mean_token_accuracy": 0.8777420520782471, "num_tokens": 831624889.0, "step": 21793 }, { "epoch": 2.7724208116015774, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.7826042175293, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8700938820838928, "num_tokens": 831662389.0, "step": 21794 }, { "epoch": 2.772548021880168, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.014930725097656, "learning_rate": 1e-06, "loss": 0.604, "mean_token_accuracy": 0.864680826663971, "num_tokens": 831705282.0, "step": 21795 }, { "epoch": 2.7726752321587584, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.505672454833984, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8700499534606934, "num_tokens": 831741604.0, "step": 21796 }, { "epoch": 2.772802442437349, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000186920166015625, "grad_norm": 52.22698211669922, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.8531501293182373, "num_tokens": 831784165.0, "step": 21797 }, { "epoch": 2.7729296527159395, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.274147033691406, "learning_rate": 1e-06, "loss": 0.6085, "mean_token_accuracy": 0.8679898977279663, "num_tokens": 831821101.0, "step": 21798 }, { "epoch": 2.77305686299453, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.16926956176758, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8661057949066162, "num_tokens": 831855766.0, "step": 21799 }, { "epoch": 2.7731840732731206, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.081851959228516, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8671839237213135, "num_tokens": 831895692.0, "step": 21800 }, { "epoch": 2.773311283551711, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.28059768676758, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8821194171905518, "num_tokens": 831931079.0, "step": 21801 }, { "epoch": 2.7734384938303016, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.91545867919922, "learning_rate": 1e-06, "loss": 0.7001, "mean_token_accuracy": 0.8353675603866577, "num_tokens": 831969742.0, "step": 21802 }, { "epoch": 2.773565704108892, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.885414123535156, "learning_rate": 1e-06, "loss": 0.6443, "mean_token_accuracy": 0.8518106937408447, "num_tokens": 832011059.0, "step": 21803 }, { "epoch": 2.7736929143874827, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.1512336730957, "learning_rate": 1e-06, "loss": 0.5876, "mean_token_accuracy": 0.8740320801734924, "num_tokens": 832042768.0, "step": 21804 }, { "epoch": 2.7738201246660728, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.127288818359375, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.870867133140564, "num_tokens": 832077699.0, "step": 21805 }, { "epoch": 2.7739473349446637, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.986427307128906, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8706088662147522, "num_tokens": 832113528.0, "step": 21806 }, { "epoch": 2.774074545223254, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.155113220214844, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8662442564964294, "num_tokens": 832149108.0, "step": 21807 }, { "epoch": 2.774201755501845, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.748695373535156, "learning_rate": 1e-06, "loss": 0.5776, "mean_token_accuracy": 0.8790035247802734, "num_tokens": 832189279.0, "step": 21808 }, { "epoch": 2.774328965780435, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.446022033691406, "learning_rate": 1e-06, "loss": 0.612, "mean_token_accuracy": 0.8659082055091858, "num_tokens": 832224423.0, "step": 21809 }, { "epoch": 2.774456176059026, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.50398254394531, "learning_rate": 1e-06, "loss": 0.6414, "mean_token_accuracy": 0.8572178483009338, "num_tokens": 832260389.0, "step": 21810 }, { "epoch": 2.774583386337616, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.60189437866211, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8615961074829102, "num_tokens": 832297591.0, "step": 21811 }, { "epoch": 2.7747105966162064, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.16970443725586, "learning_rate": 1e-06, "loss": 0.5994, "mean_token_accuracy": 0.8669445514678955, "num_tokens": 832330195.0, "step": 21812 }, { "epoch": 2.774837806894797, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.736270904541016, "learning_rate": 1e-06, "loss": 0.5901, "mean_token_accuracy": 0.8675947189331055, "num_tokens": 832361613.0, "step": 21813 }, { "epoch": 2.7749650171733875, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.82380676269531, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8688504099845886, "num_tokens": 832402800.0, "step": 21814 }, { "epoch": 2.775092227451978, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.16473388671875, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8647928237915039, "num_tokens": 832445161.0, "step": 21815 }, { "epoch": 2.7752194377305686, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.742454528808594, "learning_rate": 1e-06, "loss": 0.6575, "mean_token_accuracy": 0.848456859588623, "num_tokens": 832487905.0, "step": 21816 }, { "epoch": 2.775346648009159, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.73954772949219, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.8642550110816956, "num_tokens": 832521107.0, "step": 21817 }, { "epoch": 2.7754738582877496, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.711551666259766, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8588771224021912, "num_tokens": 832557213.0, "step": 21818 }, { "epoch": 2.77560106856634, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.9556884765625, "learning_rate": 1e-06, "loss": 0.6585, "mean_token_accuracy": 0.8501908183097839, "num_tokens": 832600445.0, "step": 21819 }, { "epoch": 2.7757282788449307, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.29255676269531, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8632532954216003, "num_tokens": 832635994.0, "step": 21820 }, { "epoch": 2.775855489123521, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.90768814086914, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.874332070350647, "num_tokens": 832677151.0, "step": 21821 }, { "epoch": 2.7759826994021117, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.014469146728516, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8629114627838135, "num_tokens": 832713995.0, "step": 21822 }, { "epoch": 2.7761099096807023, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.46508026123047, "learning_rate": 1e-06, "loss": 0.6377, "mean_token_accuracy": 0.8537254929542542, "num_tokens": 832754201.0, "step": 21823 }, { "epoch": 2.776237119959293, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.631370544433594, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8595156073570251, "num_tokens": 832796920.0, "step": 21824 }, { "epoch": 2.7763643302378833, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.489601135253906, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8584112524986267, "num_tokens": 832835952.0, "step": 21825 }, { "epoch": 2.776491540516474, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.68515396118164, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8657650351524353, "num_tokens": 832878306.0, "step": 21826 }, { "epoch": 2.7766187507950644, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.6356315612793, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8542720079421997, "num_tokens": 832914315.0, "step": 21827 }, { "epoch": 2.776745961073655, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.78633499145508, "learning_rate": 1e-06, "loss": 0.6679, "mean_token_accuracy": 0.8599395751953125, "num_tokens": 832956661.0, "step": 21828 }, { "epoch": 2.7768731713522454, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.064537048339844, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8687212467193604, "num_tokens": 832999446.0, "step": 21829 }, { "epoch": 2.7770003816308355, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.35978317260742, "learning_rate": 1e-06, "loss": 0.6974, "mean_token_accuracy": 0.8363617062568665, "num_tokens": 833039001.0, "step": 21830 }, { "epoch": 2.7771275919094265, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.2363166809082, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8624729514122009, "num_tokens": 833074334.0, "step": 21831 }, { "epoch": 2.7772548021880166, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.198402404785156, "learning_rate": 1e-06, "loss": 0.6589, "mean_token_accuracy": 0.850975513458252, "num_tokens": 833113089.0, "step": 21832 }, { "epoch": 2.7773820124666075, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.75337219238281, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8614251613616943, "num_tokens": 833149755.0, "step": 21833 }, { "epoch": 2.7775092227451976, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.4405403137207, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8541672229766846, "num_tokens": 833187718.0, "step": 21834 }, { "epoch": 2.7776364330237886, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.4352912902832, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8680763244628906, "num_tokens": 833221416.0, "step": 21835 }, { "epoch": 2.7777636433023787, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.08397674560547, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8766502737998962, "num_tokens": 833261101.0, "step": 21836 }, { "epoch": 2.777890853580969, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.39543914794922, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8705191612243652, "num_tokens": 833296804.0, "step": 21837 }, { "epoch": 2.7780180638595597, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.601646423339844, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8630696535110474, "num_tokens": 833332005.0, "step": 21838 }, { "epoch": 2.7781452741381503, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.964393615722656, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8587407469749451, "num_tokens": 833371991.0, "step": 21839 }, { "epoch": 2.778272484416741, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.45716857910156, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8749147653579712, "num_tokens": 833407514.0, "step": 21840 }, { "epoch": 2.7783996946953313, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.03178787231445, "learning_rate": 1e-06, "loss": 0.6509, "mean_token_accuracy": 0.8607318997383118, "num_tokens": 833444519.0, "step": 21841 }, { "epoch": 2.778526904973922, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.4619140625, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8593131899833679, "num_tokens": 833482252.0, "step": 21842 }, { "epoch": 2.7786541152525124, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.69528579711914, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8633819818496704, "num_tokens": 833515513.0, "step": 21843 }, { "epoch": 2.778781325531103, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.33625793457031, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8698059916496277, "num_tokens": 833551188.0, "step": 21844 }, { "epoch": 2.7789085358096934, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.865684509277344, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8720059394836426, "num_tokens": 833589159.0, "step": 21845 }, { "epoch": 2.779035746088284, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.45233917236328, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8517895936965942, "num_tokens": 833623581.0, "step": 21846 }, { "epoch": 2.7791629563668745, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.951011657714844, "learning_rate": 1e-06, "loss": 0.6631, "mean_token_accuracy": 0.8507040739059448, "num_tokens": 833660466.0, "step": 21847 }, { "epoch": 2.779290166645465, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.31647491455078, "learning_rate": 1e-06, "loss": 0.6608, "mean_token_accuracy": 0.853126049041748, "num_tokens": 833697465.0, "step": 21848 }, { "epoch": 2.7794173769240555, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.212581634521484, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8602319955825806, "num_tokens": 833735178.0, "step": 21849 }, { "epoch": 2.779544587202646, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.319522857666016, "learning_rate": 1e-06, "loss": 0.5674, "mean_token_accuracy": 0.8788971900939941, "num_tokens": 833770193.0, "step": 21850 }, { "epoch": 2.7796717974812366, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.146339416503906, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8636503219604492, "num_tokens": 833807836.0, "step": 21851 }, { "epoch": 2.779799007759827, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 52.23149108886719, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8590259552001953, "num_tokens": 833847605.0, "step": 21852 }, { "epoch": 2.7799262180384177, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.11872100830078, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8651772737503052, "num_tokens": 833884239.0, "step": 21853 }, { "epoch": 2.780053428317008, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.44831466674805, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8621534109115601, "num_tokens": 833921305.0, "step": 21854 }, { "epoch": 2.7801806385955983, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.00260925292969, "learning_rate": 1e-06, "loss": 0.653, "mean_token_accuracy": 0.8535836935043335, "num_tokens": 833959297.0, "step": 21855 }, { "epoch": 2.7803078488741892, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.47966384887695, "learning_rate": 1e-06, "loss": 0.6996, "mean_token_accuracy": 0.8372714519500732, "num_tokens": 833992135.0, "step": 21856 }, { "epoch": 2.7804350591527793, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.8776969909668, "learning_rate": 1e-06, "loss": 0.6533, "mean_token_accuracy": 0.8523592948913574, "num_tokens": 834035933.0, "step": 21857 }, { "epoch": 2.7805622694313703, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.53450393676758, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8613033294677734, "num_tokens": 834070187.0, "step": 21858 }, { "epoch": 2.7806894797099604, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.03893280029297, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8650965690612793, "num_tokens": 834107370.0, "step": 21859 }, { "epoch": 2.780816689988551, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.385562896728516, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8543373346328735, "num_tokens": 834145357.0, "step": 21860 }, { "epoch": 2.7809439002671414, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.5601921081543, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8609334230422974, "num_tokens": 834187250.0, "step": 21861 }, { "epoch": 2.781071110545732, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 53.0728759765625, "learning_rate": 1e-06, "loss": 0.5836, "mean_token_accuracy": 0.8733500242233276, "num_tokens": 834227598.0, "step": 21862 }, { "epoch": 2.7811983208243225, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.648189544677734, "learning_rate": 1e-06, "loss": 0.5728, "mean_token_accuracy": 0.8765352368354797, "num_tokens": 834267032.0, "step": 21863 }, { "epoch": 2.781325531102913, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.83863830566406, "learning_rate": 1e-06, "loss": 0.6601, "mean_token_accuracy": 0.8498905301094055, "num_tokens": 834308479.0, "step": 21864 }, { "epoch": 2.7814527413815036, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.74298858642578, "learning_rate": 1e-06, "loss": 0.5796, "mean_token_accuracy": 0.8726778626441956, "num_tokens": 834346185.0, "step": 21865 }, { "epoch": 2.781579951660094, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.80110549926758, "learning_rate": 1e-06, "loss": 0.5944, "mean_token_accuracy": 0.8700184226036072, "num_tokens": 834384404.0, "step": 21866 }, { "epoch": 2.7817071619386846, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.95848083496094, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8615382313728333, "num_tokens": 834424833.0, "step": 21867 }, { "epoch": 2.781834372217275, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.563133239746094, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.864474356174469, "num_tokens": 834464215.0, "step": 21868 }, { "epoch": 2.7819615824958657, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.465423583984375, "learning_rate": 1e-06, "loss": 0.6967, "mean_token_accuracy": 0.8394210338592529, "num_tokens": 834502452.0, "step": 21869 }, { "epoch": 2.782088792774456, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.43783187866211, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8638725876808167, "num_tokens": 834539690.0, "step": 21870 }, { "epoch": 2.7822160030530467, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.29264831542969, "learning_rate": 1e-06, "loss": 0.6602, "mean_token_accuracy": 0.8516004681587219, "num_tokens": 834583193.0, "step": 21871 }, { "epoch": 2.7823432133316373, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.417388916015625, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8580765128135681, "num_tokens": 834620028.0, "step": 21872 }, { "epoch": 2.782470423610228, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.63301086425781, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8510231375694275, "num_tokens": 834657351.0, "step": 21873 }, { "epoch": 2.7825976338888183, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.12158966064453, "learning_rate": 1e-06, "loss": 0.5559, "mean_token_accuracy": 0.8771771192550659, "num_tokens": 834693687.0, "step": 21874 }, { "epoch": 2.782724844167409, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.90435028076172, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.8618921041488647, "num_tokens": 834736171.0, "step": 21875 }, { "epoch": 2.7828520544459994, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.358192443847656, "learning_rate": 1e-06, "loss": 0.5755, "mean_token_accuracy": 0.8760095238685608, "num_tokens": 834775699.0, "step": 21876 }, { "epoch": 2.78297926472459, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.69648742675781, "learning_rate": 1e-06, "loss": 0.6433, "mean_token_accuracy": 0.8565399646759033, "num_tokens": 834816053.0, "step": 21877 }, { "epoch": 2.78310647500318, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.096561431884766, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8566176891326904, "num_tokens": 834857125.0, "step": 21878 }, { "epoch": 2.783233685281771, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.14994430541992, "learning_rate": 1e-06, "loss": 0.5967, "mean_token_accuracy": 0.8746830224990845, "num_tokens": 834895676.0, "step": 21879 }, { "epoch": 2.783360895560361, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.437896728515625, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8678454756736755, "num_tokens": 834927748.0, "step": 21880 }, { "epoch": 2.783488105838952, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.2745475769043, "learning_rate": 1e-06, "loss": 0.6456, "mean_token_accuracy": 0.859065055847168, "num_tokens": 834966722.0, "step": 21881 }, { "epoch": 2.783615316117542, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.813255310058594, "learning_rate": 1e-06, "loss": 0.6193, "mean_token_accuracy": 0.8630111217498779, "num_tokens": 835003770.0, "step": 21882 }, { "epoch": 2.783742526396133, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.20835876464844, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8715425729751587, "num_tokens": 835041504.0, "step": 21883 }, { "epoch": 2.783869736674723, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.01015090942383, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8640487790107727, "num_tokens": 835075179.0, "step": 21884 }, { "epoch": 2.7839969469533137, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.181365966796875, "learning_rate": 1e-06, "loss": 0.6923, "mean_token_accuracy": 0.8524585962295532, "num_tokens": 835113080.0, "step": 21885 }, { "epoch": 2.784124157231904, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.945762634277344, "learning_rate": 1e-06, "loss": 0.6194, "mean_token_accuracy": 0.8614692091941833, "num_tokens": 835150586.0, "step": 21886 }, { "epoch": 2.7842513675104947, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.634613037109375, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8593332767486572, "num_tokens": 835184049.0, "step": 21887 }, { "epoch": 2.7843785777890853, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.55904769897461, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8587144613265991, "num_tokens": 835222745.0, "step": 21888 }, { "epoch": 2.784505788067676, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.2875862121582, "learning_rate": 1e-06, "loss": 0.6611, "mean_token_accuracy": 0.8506883382797241, "num_tokens": 835263017.0, "step": 21889 }, { "epoch": 2.7846329983462663, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.68266296386719, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8765577077865601, "num_tokens": 835303916.0, "step": 21890 }, { "epoch": 2.784760208624857, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.78383255004883, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8766761422157288, "num_tokens": 835343148.0, "step": 21891 }, { "epoch": 2.7848874189034474, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.6047248840332, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8601353168487549, "num_tokens": 835382337.0, "step": 21892 }, { "epoch": 2.785014629182038, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.18059539794922, "learning_rate": 1e-06, "loss": 0.6486, "mean_token_accuracy": 0.8569214344024658, "num_tokens": 835423645.0, "step": 21893 }, { "epoch": 2.7851418394606284, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.34394073486328, "learning_rate": 1e-06, "loss": 0.7002, "mean_token_accuracy": 0.8332515954971313, "num_tokens": 835464590.0, "step": 21894 }, { "epoch": 2.785269049739219, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.264122009277344, "learning_rate": 1e-06, "loss": 0.5547, "mean_token_accuracy": 0.8877534866333008, "num_tokens": 835496376.0, "step": 21895 }, { "epoch": 2.7853962600178095, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.956451416015625, "learning_rate": 1e-06, "loss": 0.6695, "mean_token_accuracy": 0.8473184108734131, "num_tokens": 835541231.0, "step": 21896 }, { "epoch": 2.7855234702964, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.82249069213867, "learning_rate": 1e-06, "loss": 0.6325, "mean_token_accuracy": 0.864907443523407, "num_tokens": 835584038.0, "step": 21897 }, { "epoch": 2.7856506805749905, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.466888427734375, "learning_rate": 1e-06, "loss": 0.6439, "mean_token_accuracy": 0.8569961786270142, "num_tokens": 835627189.0, "step": 21898 }, { "epoch": 2.785777890853581, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.69614028930664, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8631306886672974, "num_tokens": 835659701.0, "step": 21899 }, { "epoch": 2.7859051011321716, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.424041748046875, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8682422637939453, "num_tokens": 835694045.0, "step": 21900 }, { "epoch": 2.786032311410762, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.4853515625, "learning_rate": 1e-06, "loss": 0.6342, "mean_token_accuracy": 0.8606927394866943, "num_tokens": 835732118.0, "step": 21901 }, { "epoch": 2.7861595216893527, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.529083251953125, "learning_rate": 1e-06, "loss": 0.602, "mean_token_accuracy": 0.8715543746948242, "num_tokens": 835772700.0, "step": 21902 }, { "epoch": 2.7862867319679427, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.321109771728516, "learning_rate": 1e-06, "loss": 0.6927, "mean_token_accuracy": 0.8456695675849915, "num_tokens": 835811206.0, "step": 21903 }, { "epoch": 2.7864139422465337, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.06156921386719, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8673701882362366, "num_tokens": 835844381.0, "step": 21904 }, { "epoch": 2.786541152525124, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.2934455871582, "learning_rate": 1e-06, "loss": 0.6481, "mean_token_accuracy": 0.8536900281906128, "num_tokens": 835879926.0, "step": 21905 }, { "epoch": 2.7866683628037148, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.02226638793945, "learning_rate": 1e-06, "loss": 0.662, "mean_token_accuracy": 0.8514673113822937, "num_tokens": 835924014.0, "step": 21906 }, { "epoch": 2.786795573082305, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.05552291870117, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8559390902519226, "num_tokens": 835956054.0, "step": 21907 }, { "epoch": 2.786922783360896, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.07145690917969, "learning_rate": 1e-06, "loss": 0.6088, "mean_token_accuracy": 0.8682741522789001, "num_tokens": 835988885.0, "step": 21908 }, { "epoch": 2.787049993639486, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.76151657104492, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8613657355308533, "num_tokens": 836028885.0, "step": 21909 }, { "epoch": 2.7871772039180764, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.552799224853516, "learning_rate": 1e-06, "loss": 0.5892, "mean_token_accuracy": 0.8753423690795898, "num_tokens": 836067340.0, "step": 21910 }, { "epoch": 2.787304414196667, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.76608657836914, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8623948693275452, "num_tokens": 836104432.0, "step": 21911 }, { "epoch": 2.7874316244752575, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.272953033447266, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8583022952079773, "num_tokens": 836141197.0, "step": 21912 }, { "epoch": 2.787558834753848, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.705238342285156, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.875257670879364, "num_tokens": 836176949.0, "step": 21913 }, { "epoch": 2.7876860450324386, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.485225677490234, "learning_rate": 1e-06, "loss": 0.6394, "mean_token_accuracy": 0.8611892461776733, "num_tokens": 836206484.0, "step": 21914 }, { "epoch": 2.787813255311029, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.16850280761719, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8678135871887207, "num_tokens": 836243056.0, "step": 21915 }, { "epoch": 2.7879404655896196, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.2970085144043, "learning_rate": 1e-06, "loss": 0.6485, "mean_token_accuracy": 0.8578577637672424, "num_tokens": 836281154.0, "step": 21916 }, { "epoch": 2.78806767586821, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.142398834228516, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8621466159820557, "num_tokens": 836319895.0, "step": 21917 }, { "epoch": 2.7881948861468007, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.87957000732422, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8675186634063721, "num_tokens": 836357227.0, "step": 21918 }, { "epoch": 2.788322096425391, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.18708419799805, "learning_rate": 1e-06, "loss": 0.6496, "mean_token_accuracy": 0.8537884950637817, "num_tokens": 836398442.0, "step": 21919 }, { "epoch": 2.7884493067039817, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.88441467285156, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8703954219818115, "num_tokens": 836436660.0, "step": 21920 }, { "epoch": 2.7885765169825723, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.183067321777344, "learning_rate": 1e-06, "loss": 0.6521, "mean_token_accuracy": 0.8559653162956238, "num_tokens": 836479180.0, "step": 21921 }, { "epoch": 2.788703727261163, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.875, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8696532249450684, "num_tokens": 836518874.0, "step": 21922 }, { "epoch": 2.7888309375397533, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.92689514160156, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8581945896148682, "num_tokens": 836556910.0, "step": 21923 }, { "epoch": 2.788958147818344, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.22308349609375, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8745187520980835, "num_tokens": 836598151.0, "step": 21924 }, { "epoch": 2.7890853580969344, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.87138748168945, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.8542871475219727, "num_tokens": 836637952.0, "step": 21925 }, { "epoch": 2.789212568375525, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.46428680419922, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8760300874710083, "num_tokens": 836677138.0, "step": 21926 }, { "epoch": 2.7893397786541154, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.04601287841797, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8673865795135498, "num_tokens": 836716880.0, "step": 21927 }, { "epoch": 2.7894669889327055, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.730533599853516, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8770630359649658, "num_tokens": 836752572.0, "step": 21928 }, { "epoch": 2.7895941992112965, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.8024787902832, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8589751720428467, "num_tokens": 836794321.0, "step": 21929 }, { "epoch": 2.7897214094898866, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.5219612121582, "learning_rate": 1e-06, "loss": 0.5725, "mean_token_accuracy": 0.8745086789131165, "num_tokens": 836829183.0, "step": 21930 }, { "epoch": 2.7898486197684775, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.269676208496094, "learning_rate": 1e-06, "loss": 0.6584, "mean_token_accuracy": 0.8505740761756897, "num_tokens": 836864644.0, "step": 21931 }, { "epoch": 2.7899758300470676, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.27052307128906, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8748770952224731, "num_tokens": 836895384.0, "step": 21932 }, { "epoch": 2.7901030403256586, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.059059143066406, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8649020195007324, "num_tokens": 836943336.0, "step": 21933 }, { "epoch": 2.7902302506042487, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.666114807128906, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8792080283164978, "num_tokens": 836980708.0, "step": 21934 }, { "epoch": 2.790357460882839, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.09134292602539, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8668642044067383, "num_tokens": 837015738.0, "step": 21935 }, { "epoch": 2.7904846711614297, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.56206130981445, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8700565099716187, "num_tokens": 837053014.0, "step": 21936 }, { "epoch": 2.7906118814400203, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.163963317871094, "learning_rate": 1e-06, "loss": 0.6518, "mean_token_accuracy": 0.8586527109146118, "num_tokens": 837083579.0, "step": 21937 }, { "epoch": 2.790739091718611, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.391143798828125, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8727859854698181, "num_tokens": 837126201.0, "step": 21938 }, { "epoch": 2.7908663019972013, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.3283805847168, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8729467988014221, "num_tokens": 837162400.0, "step": 21939 }, { "epoch": 2.790993512275792, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.37308120727539, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8673011660575867, "num_tokens": 837205733.0, "step": 21940 }, { "epoch": 2.7911207225543824, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.34807205200195, "learning_rate": 1e-06, "loss": 0.6586, "mean_token_accuracy": 0.8532871007919312, "num_tokens": 837243838.0, "step": 21941 }, { "epoch": 2.791247932832973, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 51.8720703125, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.858312726020813, "num_tokens": 837280936.0, "step": 21942 }, { "epoch": 2.7913751431115634, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.81446838378906, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8622277975082397, "num_tokens": 837321790.0, "step": 21943 }, { "epoch": 2.791502353390154, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.552223205566406, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8676818609237671, "num_tokens": 837362023.0, "step": 21944 }, { "epoch": 2.7916295636687445, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.2386360168457, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.870907723903656, "num_tokens": 837404876.0, "step": 21945 }, { "epoch": 2.791756773947335, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.391807556152344, "learning_rate": 1e-06, "loss": 0.6428, "mean_token_accuracy": 0.8542526960372925, "num_tokens": 837442899.0, "step": 21946 }, { "epoch": 2.7918839842259255, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.41447067260742, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8721761703491211, "num_tokens": 837477611.0, "step": 21947 }, { "epoch": 2.792011194504516, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.5330810546875, "learning_rate": 1e-06, "loss": 0.6639, "mean_token_accuracy": 0.8493147492408752, "num_tokens": 837515928.0, "step": 21948 }, { "epoch": 2.7921384047831066, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.41889953613281, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.862114667892456, "num_tokens": 837553042.0, "step": 21949 }, { "epoch": 2.792265615061697, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.56686019897461, "learning_rate": 1e-06, "loss": 0.6135, "mean_token_accuracy": 0.8692805767059326, "num_tokens": 837589749.0, "step": 21950 }, { "epoch": 2.7923928253402877, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.558448791503906, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.881812334060669, "num_tokens": 837631104.0, "step": 21951 }, { "epoch": 2.792520035618878, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.393951416015625, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.873296856880188, "num_tokens": 837668173.0, "step": 21952 }, { "epoch": 2.7926472458974683, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.38308334350586, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8695380091667175, "num_tokens": 837704282.0, "step": 21953 }, { "epoch": 2.7927744561760592, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.224891662597656, "learning_rate": 1e-06, "loss": 0.6614, "mean_token_accuracy": 0.8534588813781738, "num_tokens": 837750790.0, "step": 21954 }, { "epoch": 2.7929016664546493, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 51.930850982666016, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8605036735534668, "num_tokens": 837792041.0, "step": 21955 }, { "epoch": 2.7930288767332403, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.0951042175293, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8710941672325134, "num_tokens": 837818645.0, "step": 21956 }, { "epoch": 2.7931560870118304, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.11326217651367, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8723694086074829, "num_tokens": 837859816.0, "step": 21957 }, { "epoch": 2.793283297290421, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.89794158935547, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8569127917289734, "num_tokens": 837903970.0, "step": 21958 }, { "epoch": 2.7934105075690114, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.558467864990234, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8620550632476807, "num_tokens": 837941158.0, "step": 21959 }, { "epoch": 2.793537717847602, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.08097457885742, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8697360157966614, "num_tokens": 837978157.0, "step": 21960 }, { "epoch": 2.7936649281261925, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.375267028808594, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.875099778175354, "num_tokens": 838017871.0, "step": 21961 }, { "epoch": 2.793792138404783, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.074092864990234, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8750307559967041, "num_tokens": 838050521.0, "step": 21962 }, { "epoch": 2.7939193486833735, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.5450325012207, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8720547556877136, "num_tokens": 838084450.0, "step": 21963 }, { "epoch": 2.794046558961964, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.13337707519531, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8569964170455933, "num_tokens": 838121505.0, "step": 21964 }, { "epoch": 2.7941737692405546, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.60285568237305, "learning_rate": 1e-06, "loss": 0.6214, "mean_token_accuracy": 0.8620790243148804, "num_tokens": 838166148.0, "step": 21965 }, { "epoch": 2.794300979519145, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.89760208129883, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8686328530311584, "num_tokens": 838203578.0, "step": 21966 }, { "epoch": 2.7944281897977357, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.82609558105469, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8669261932373047, "num_tokens": 838240702.0, "step": 21967 }, { "epoch": 2.794555400076326, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.55173110961914, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8699589967727661, "num_tokens": 838275639.0, "step": 21968 }, { "epoch": 2.7946826103549167, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.030731201171875, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8647198677062988, "num_tokens": 838310045.0, "step": 21969 }, { "epoch": 2.7948098206335072, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.506229400634766, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8646941184997559, "num_tokens": 838351571.0, "step": 21970 }, { "epoch": 2.7949370309120978, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.258872985839844, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8768905401229858, "num_tokens": 838393804.0, "step": 21971 }, { "epoch": 2.7950642411906883, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.993247985839844, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8647695183753967, "num_tokens": 838435516.0, "step": 21972 }, { "epoch": 2.795191451469279, "ewc_loss": 0.21875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.14908981323242, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8704447746276855, "num_tokens": 838465425.0, "step": 21973 }, { "epoch": 2.7953186617478694, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.93046951293945, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8617407083511353, "num_tokens": 838512832.0, "step": 21974 }, { "epoch": 2.79544587202646, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.780094146728516, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8706055879592896, "num_tokens": 838552806.0, "step": 21975 }, { "epoch": 2.79557308230505, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.82838439941406, "learning_rate": 1e-06, "loss": 0.6343, "mean_token_accuracy": 0.8619000911712646, "num_tokens": 838588592.0, "step": 21976 }, { "epoch": 2.795700292583641, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.65272903442383, "learning_rate": 1e-06, "loss": 0.6279, "mean_token_accuracy": 0.8613170385360718, "num_tokens": 838629402.0, "step": 21977 }, { "epoch": 2.795827502862231, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.23384475708008, "learning_rate": 1e-06, "loss": 0.6604, "mean_token_accuracy": 0.8548052310943604, "num_tokens": 838676580.0, "step": 21978 }, { "epoch": 2.795954713140822, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.8499641418457, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8574120998382568, "num_tokens": 838715773.0, "step": 21979 }, { "epoch": 2.796081923419412, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.62567901611328, "learning_rate": 1e-06, "loss": 0.6658, "mean_token_accuracy": 0.8518650531768799, "num_tokens": 838750270.0, "step": 21980 }, { "epoch": 2.796209133698003, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.42234420776367, "learning_rate": 1e-06, "loss": 0.6788, "mean_token_accuracy": 0.8423513770103455, "num_tokens": 838794823.0, "step": 21981 }, { "epoch": 2.796336343976593, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.607723236083984, "learning_rate": 1e-06, "loss": 0.679, "mean_token_accuracy": 0.8530830144882202, "num_tokens": 838828901.0, "step": 21982 }, { "epoch": 2.7964635542551837, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.249019622802734, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8553446531295776, "num_tokens": 838872648.0, "step": 21983 }, { "epoch": 2.796590764533774, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.14118957519531, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.8748651146888733, "num_tokens": 838905150.0, "step": 21984 }, { "epoch": 2.7967179748123647, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.6996955871582, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8641541004180908, "num_tokens": 838949027.0, "step": 21985 }, { "epoch": 2.7968451850909553, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.03105926513672, "learning_rate": 1e-06, "loss": 0.6256, "mean_token_accuracy": 0.8647069334983826, "num_tokens": 838987320.0, "step": 21986 }, { "epoch": 2.796972395369546, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.192604064941406, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8586957454681396, "num_tokens": 839027396.0, "step": 21987 }, { "epoch": 2.7970996056481363, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.40035629272461, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8693690299987793, "num_tokens": 839069779.0, "step": 21988 }, { "epoch": 2.797226815926727, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.479434967041016, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8703247308731079, "num_tokens": 839110919.0, "step": 21989 }, { "epoch": 2.7973540262053174, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.0904541015625, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8775929808616638, "num_tokens": 839143689.0, "step": 21990 }, { "epoch": 2.797481236483908, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.85349655151367, "learning_rate": 1e-06, "loss": 0.691, "mean_token_accuracy": 0.8435195088386536, "num_tokens": 839181274.0, "step": 21991 }, { "epoch": 2.7976084467624984, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.40748977661133, "learning_rate": 1e-06, "loss": 0.671, "mean_token_accuracy": 0.8471426367759705, "num_tokens": 839217038.0, "step": 21992 }, { "epoch": 2.797735657041089, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.86410140991211, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8636939525604248, "num_tokens": 839260104.0, "step": 21993 }, { "epoch": 2.7978628673196795, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.22091293334961, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8663855791091919, "num_tokens": 839294328.0, "step": 21994 }, { "epoch": 2.79799007759827, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.67073440551758, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8623189926147461, "num_tokens": 839326662.0, "step": 21995 }, { "epoch": 2.7981172878768605, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.0029182434082, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.863959789276123, "num_tokens": 839355997.0, "step": 21996 }, { "epoch": 2.798244498155451, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.94123458862305, "learning_rate": 1e-06, "loss": 0.6073, "mean_token_accuracy": 0.8694380521774292, "num_tokens": 839396596.0, "step": 21997 }, { "epoch": 2.7983717084340416, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.19397735595703, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8662260174751282, "num_tokens": 839432251.0, "step": 21998 }, { "epoch": 2.798498918712632, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.969642639160156, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8725159168243408, "num_tokens": 839465987.0, "step": 21999 }, { "epoch": 2.7986261289912227, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.24662780761719, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8650979995727539, "num_tokens": 839506147.0, "step": 22000 }, { "epoch": 2.7987533392698127, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.36929702758789, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.8566357493400574, "num_tokens": 839544620.0, "step": 22001 }, { "epoch": 2.7988805495484037, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.35070037841797, "learning_rate": 1e-06, "loss": 0.6588, "mean_token_accuracy": 0.8539642095565796, "num_tokens": 839583217.0, "step": 22002 }, { "epoch": 2.799007759826994, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.539344787597656, "learning_rate": 1e-06, "loss": 0.5735, "mean_token_accuracy": 0.8832883834838867, "num_tokens": 839621283.0, "step": 22003 }, { "epoch": 2.7991349701055848, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.483604431152344, "learning_rate": 1e-06, "loss": 0.6415, "mean_token_accuracy": 0.8592520952224731, "num_tokens": 839658072.0, "step": 22004 }, { "epoch": 2.799262180384175, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.2997932434082, "learning_rate": 1e-06, "loss": 0.6814, "mean_token_accuracy": 0.8464124202728271, "num_tokens": 839700926.0, "step": 22005 }, { "epoch": 2.799389390662766, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.884098052978516, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8722453117370605, "num_tokens": 839736160.0, "step": 22006 }, { "epoch": 2.799516600941356, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.52775192260742, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.879599928855896, "num_tokens": 839776834.0, "step": 22007 }, { "epoch": 2.7996438112199464, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.730316162109375, "learning_rate": 1e-06, "loss": 0.6612, "mean_token_accuracy": 0.8560042977333069, "num_tokens": 839818156.0, "step": 22008 }, { "epoch": 2.799771021498537, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.44374084472656, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8581134080886841, "num_tokens": 839857133.0, "step": 22009 }, { "epoch": 2.7998982317771275, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.057594299316406, "learning_rate": 1e-06, "loss": 0.6625, "mean_token_accuracy": 0.847903311252594, "num_tokens": 839893129.0, "step": 22010 }, { "epoch": 2.800025442055718, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.29386901855469, "learning_rate": 1e-06, "loss": 0.6131, "mean_token_accuracy": 0.8662098050117493, "num_tokens": 839924622.0, "step": 22011 }, { "epoch": 2.8001526523343085, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.07866668701172, "learning_rate": 1e-06, "loss": 0.5569, "mean_token_accuracy": 0.8820291757583618, "num_tokens": 839961276.0, "step": 22012 }, { "epoch": 2.800279862612899, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.81395721435547, "learning_rate": 1e-06, "loss": 0.6484, "mean_token_accuracy": 0.8535829782485962, "num_tokens": 840002554.0, "step": 22013 }, { "epoch": 2.8004070728914896, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.26732635498047, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8629572987556458, "num_tokens": 840041642.0, "step": 22014 }, { "epoch": 2.80053428317008, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.03696823120117, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8722213506698608, "num_tokens": 840083532.0, "step": 22015 }, { "epoch": 2.8006614934486707, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.11898422241211, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8600870370864868, "num_tokens": 840124221.0, "step": 22016 }, { "epoch": 2.800788703727261, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.90562438964844, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8687373399734497, "num_tokens": 840162309.0, "step": 22017 }, { "epoch": 2.8009159140058517, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.56538009643555, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8630700707435608, "num_tokens": 840194778.0, "step": 22018 }, { "epoch": 2.8010431242844422, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.76090621948242, "learning_rate": 1e-06, "loss": 0.5685, "mean_token_accuracy": 0.8805186748504639, "num_tokens": 840225081.0, "step": 22019 }, { "epoch": 2.8011703345630328, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.60829544067383, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8629852533340454, "num_tokens": 840267698.0, "step": 22020 }, { "epoch": 2.8012975448416233, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 54.10449981689453, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.856140673160553, "num_tokens": 840303255.0, "step": 22021 }, { "epoch": 2.801424755120214, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.22136306762695, "learning_rate": 1e-06, "loss": 0.6136, "mean_token_accuracy": 0.8616269826889038, "num_tokens": 840344616.0, "step": 22022 }, { "epoch": 2.8015519653988044, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.609832763671875, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8643267750740051, "num_tokens": 840383293.0, "step": 22023 }, { "epoch": 2.801679175677395, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001888275146484375, "grad_norm": 51.48713302612305, "learning_rate": 1e-06, "loss": 0.6509, "mean_token_accuracy": 0.8523815870285034, "num_tokens": 840424583.0, "step": 22024 }, { "epoch": 2.8018063859559854, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 54.7980842590332, "learning_rate": 1e-06, "loss": 0.6456, "mean_token_accuracy": 0.8560616374015808, "num_tokens": 840458752.0, "step": 22025 }, { "epoch": 2.8019335962345755, "ewc_loss": 0.2109375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.2958869934082, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8689422607421875, "num_tokens": 840498441.0, "step": 22026 }, { "epoch": 2.8020608065131665, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.00713348388672, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8734703063964844, "num_tokens": 840537838.0, "step": 22027 }, { "epoch": 2.8021880167917566, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.01603698730469, "learning_rate": 1e-06, "loss": 0.6507, "mean_token_accuracy": 0.8507083654403687, "num_tokens": 840571770.0, "step": 22028 }, { "epoch": 2.8023152270703475, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.808650970458984, "learning_rate": 1e-06, "loss": 0.652, "mean_token_accuracy": 0.8569748401641846, "num_tokens": 840607559.0, "step": 22029 }, { "epoch": 2.8024424373489376, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.20136260986328, "learning_rate": 1e-06, "loss": 0.6644, "mean_token_accuracy": 0.8481093645095825, "num_tokens": 840650156.0, "step": 22030 }, { "epoch": 2.8025696476275286, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.20518493652344, "learning_rate": 1e-06, "loss": 0.6622, "mean_token_accuracy": 0.8553929328918457, "num_tokens": 840686590.0, "step": 22031 }, { "epoch": 2.8026968579061187, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.39236831665039, "learning_rate": 1e-06, "loss": 0.6516, "mean_token_accuracy": 0.8553135991096497, "num_tokens": 840724445.0, "step": 22032 }, { "epoch": 2.802824068184709, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.8018798828125, "learning_rate": 1e-06, "loss": 0.5747, "mean_token_accuracy": 0.8772809505462646, "num_tokens": 840765047.0, "step": 22033 }, { "epoch": 2.8029512784632997, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.85250473022461, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8507777452468872, "num_tokens": 840802375.0, "step": 22034 }, { "epoch": 2.8030784887418903, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.22742462158203, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8615134954452515, "num_tokens": 840839994.0, "step": 22035 }, { "epoch": 2.803205699020481, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.77173614501953, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8608643412590027, "num_tokens": 840877144.0, "step": 22036 }, { "epoch": 2.8033329092990713, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.65375518798828, "learning_rate": 1e-06, "loss": 0.6646, "mean_token_accuracy": 0.8535997867584229, "num_tokens": 840918478.0, "step": 22037 }, { "epoch": 2.803460119577662, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.02165222167969, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8735197186470032, "num_tokens": 840958484.0, "step": 22038 }, { "epoch": 2.8035873298562524, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.236576080322266, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8662980794906616, "num_tokens": 841000035.0, "step": 22039 }, { "epoch": 2.803714540134843, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.00081253051758, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8566706776618958, "num_tokens": 841035836.0, "step": 22040 }, { "epoch": 2.8038417504134334, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.54001235961914, "learning_rate": 1e-06, "loss": 0.6262, "mean_token_accuracy": 0.8655311465263367, "num_tokens": 841069295.0, "step": 22041 }, { "epoch": 2.803968960692024, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.31718826293945, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8576644659042358, "num_tokens": 841108341.0, "step": 22042 }, { "epoch": 2.8040961709706145, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.08818054199219, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8730362057685852, "num_tokens": 841142945.0, "step": 22043 }, { "epoch": 2.804223381249205, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.756202697753906, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8756992816925049, "num_tokens": 841174137.0, "step": 22044 }, { "epoch": 2.8043505915277955, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.816490173339844, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8727937936782837, "num_tokens": 841215914.0, "step": 22045 }, { "epoch": 2.804477801806386, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.69010543823242, "learning_rate": 1e-06, "loss": 0.6127, "mean_token_accuracy": 0.8645756244659424, "num_tokens": 841256097.0, "step": 22046 }, { "epoch": 2.8046050120849766, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.987125396728516, "learning_rate": 1e-06, "loss": 0.6175, "mean_token_accuracy": 0.868270993232727, "num_tokens": 841288997.0, "step": 22047 }, { "epoch": 2.804732222363567, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.60006332397461, "learning_rate": 1e-06, "loss": 0.6426, "mean_token_accuracy": 0.8585061430931091, "num_tokens": 841328053.0, "step": 22048 }, { "epoch": 2.8048594326421576, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.95206069946289, "learning_rate": 1e-06, "loss": 0.5663, "mean_token_accuracy": 0.8827980756759644, "num_tokens": 841366848.0, "step": 22049 }, { "epoch": 2.804986642920748, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.0863037109375, "learning_rate": 1e-06, "loss": 0.5882, "mean_token_accuracy": 0.871602475643158, "num_tokens": 841398568.0, "step": 22050 }, { "epoch": 2.8051138531993383, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.26626205444336, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8597794771194458, "num_tokens": 841435722.0, "step": 22051 }, { "epoch": 2.8052410634779292, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.29925537109375, "learning_rate": 1e-06, "loss": 0.6437, "mean_token_accuracy": 0.8590865135192871, "num_tokens": 841473635.0, "step": 22052 }, { "epoch": 2.8053682737565193, "ewc_loss": 0.212890625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.083396911621094, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8662405014038086, "num_tokens": 841512383.0, "step": 22053 }, { "epoch": 2.8054954840351103, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.76576232910156, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8562489748001099, "num_tokens": 841543174.0, "step": 22054 }, { "epoch": 2.8056226943137004, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.026729583740234, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.868399977684021, "num_tokens": 841582185.0, "step": 22055 }, { "epoch": 2.805749904592291, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.12437057495117, "learning_rate": 1e-06, "loss": 0.6352, "mean_token_accuracy": 0.8567534685134888, "num_tokens": 841623179.0, "step": 22056 }, { "epoch": 2.8058771148708814, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.81674575805664, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8725039958953857, "num_tokens": 841661062.0, "step": 22057 }, { "epoch": 2.806004325149472, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.6745719909668, "learning_rate": 1e-06, "loss": 0.622, "mean_token_accuracy": 0.859489917755127, "num_tokens": 841704259.0, "step": 22058 }, { "epoch": 2.8061315354280625, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.29001998901367, "learning_rate": 1e-06, "loss": 0.6361, "mean_token_accuracy": 0.8605350852012634, "num_tokens": 841748396.0, "step": 22059 }, { "epoch": 2.806258745706653, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.76689529418945, "learning_rate": 1e-06, "loss": 0.5543, "mean_token_accuracy": 0.8822227716445923, "num_tokens": 841787809.0, "step": 22060 }, { "epoch": 2.8063859559852435, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.90471649169922, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8630779385566711, "num_tokens": 841824046.0, "step": 22061 }, { "epoch": 2.806513166263834, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 51.94752502441406, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8721745014190674, "num_tokens": 841856949.0, "step": 22062 }, { "epoch": 2.8066403765424246, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.80404281616211, "learning_rate": 1e-06, "loss": 0.6776, "mean_token_accuracy": 0.8487904071807861, "num_tokens": 841898644.0, "step": 22063 }, { "epoch": 2.806767586821015, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.5198860168457, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8663856387138367, "num_tokens": 841934795.0, "step": 22064 }, { "epoch": 2.8068947970996057, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.722476959228516, "learning_rate": 1e-06, "loss": 0.6805, "mean_token_accuracy": 0.8481139540672302, "num_tokens": 841976878.0, "step": 22065 }, { "epoch": 2.807022007378196, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.26262664794922, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.864649772644043, "num_tokens": 842010998.0, "step": 22066 }, { "epoch": 2.8071492176567867, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.4097785949707, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8890445232391357, "num_tokens": 842051410.0, "step": 22067 }, { "epoch": 2.8072764279353772, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.69790267944336, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8663796186447144, "num_tokens": 842087954.0, "step": 22068 }, { "epoch": 2.8074036382139678, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.87834548950195, "learning_rate": 1e-06, "loss": 0.635, "mean_token_accuracy": 0.8604902625083923, "num_tokens": 842128326.0, "step": 22069 }, { "epoch": 2.8075308484925583, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.8795166015625, "learning_rate": 1e-06, "loss": 0.6914, "mean_token_accuracy": 0.8424508571624756, "num_tokens": 842171143.0, "step": 22070 }, { "epoch": 2.807658058771149, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.6363525390625, "learning_rate": 1e-06, "loss": 0.6619, "mean_token_accuracy": 0.8502432703971863, "num_tokens": 842212167.0, "step": 22071 }, { "epoch": 2.8077852690497394, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.18824768066406, "learning_rate": 1e-06, "loss": 0.6475, "mean_token_accuracy": 0.8540872931480408, "num_tokens": 842252576.0, "step": 22072 }, { "epoch": 2.80791247932833, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.65582275390625, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8590102195739746, "num_tokens": 842287812.0, "step": 22073 }, { "epoch": 2.80803968960692, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.464141845703125, "learning_rate": 1e-06, "loss": 0.6396, "mean_token_accuracy": 0.859604001045227, "num_tokens": 842328289.0, "step": 22074 }, { "epoch": 2.808166899885511, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.826438903808594, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8652053475379944, "num_tokens": 842359182.0, "step": 22075 }, { "epoch": 2.808294110164101, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.21984100341797, "learning_rate": 1e-06, "loss": 0.6629, "mean_token_accuracy": 0.855277955532074, "num_tokens": 842399820.0, "step": 22076 }, { "epoch": 2.808421320442692, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.5710563659668, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.8611578941345215, "num_tokens": 842437852.0, "step": 22077 }, { "epoch": 2.808548530721282, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.30103302001953, "learning_rate": 1e-06, "loss": 0.665, "mean_token_accuracy": 0.848904013633728, "num_tokens": 842476729.0, "step": 22078 }, { "epoch": 2.808675740999873, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.51953125, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8515743017196655, "num_tokens": 842514590.0, "step": 22079 }, { "epoch": 2.808802951278463, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.29256057739258, "learning_rate": 1e-06, "loss": 0.6353, "mean_token_accuracy": 0.8619526624679565, "num_tokens": 842556502.0, "step": 22080 }, { "epoch": 2.8089301615570537, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.42100524902344, "learning_rate": 1e-06, "loss": 0.6605, "mean_token_accuracy": 0.8509854078292847, "num_tokens": 842594299.0, "step": 22081 }, { "epoch": 2.809057371835644, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.31755065917969, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8603289127349854, "num_tokens": 842633189.0, "step": 22082 }, { "epoch": 2.8091845821142347, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.182090759277344, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8650639057159424, "num_tokens": 842673626.0, "step": 22083 }, { "epoch": 2.8093117923928252, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.03962707519531, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8559378385543823, "num_tokens": 842712574.0, "step": 22084 }, { "epoch": 2.8094390026714158, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.41375732421875, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8611475229263306, "num_tokens": 842749534.0, "step": 22085 }, { "epoch": 2.8095662129500063, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.51213455200195, "learning_rate": 1e-06, "loss": 0.6509, "mean_token_accuracy": 0.8520380854606628, "num_tokens": 842784989.0, "step": 22086 }, { "epoch": 2.809693423228597, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.993995666503906, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8633943796157837, "num_tokens": 842823858.0, "step": 22087 }, { "epoch": 2.8098206335071874, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.865718841552734, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8632963299751282, "num_tokens": 842858849.0, "step": 22088 }, { "epoch": 2.809947843785778, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.81492233276367, "learning_rate": 1e-06, "loss": 0.5567, "mean_token_accuracy": 0.8831509351730347, "num_tokens": 842895809.0, "step": 22089 }, { "epoch": 2.8100750540643684, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.66276168823242, "learning_rate": 1e-06, "loss": 0.5383, "mean_token_accuracy": 0.8909174203872681, "num_tokens": 842933650.0, "step": 22090 }, { "epoch": 2.810202264342959, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.34220504760742, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8680453896522522, "num_tokens": 842967200.0, "step": 22091 }, { "epoch": 2.8103294746215495, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.45931625366211, "learning_rate": 1e-06, "loss": 0.644, "mean_token_accuracy": 0.8537166118621826, "num_tokens": 843002498.0, "step": 22092 }, { "epoch": 2.81045668490014, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.583763122558594, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8671450614929199, "num_tokens": 843042627.0, "step": 22093 }, { "epoch": 2.8105838951787305, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.52102279663086, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8586667776107788, "num_tokens": 843076862.0, "step": 22094 }, { "epoch": 2.810711105457321, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.25125503540039, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8623376488685608, "num_tokens": 843115000.0, "step": 22095 }, { "epoch": 2.8108383157359116, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.229679107666016, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.8694251179695129, "num_tokens": 843152408.0, "step": 22096 }, { "epoch": 2.810965526014502, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.324825286865234, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.8647692203521729, "num_tokens": 843189269.0, "step": 22097 }, { "epoch": 2.8110927362930926, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.40187454223633, "learning_rate": 1e-06, "loss": 0.6324, "mean_token_accuracy": 0.857909083366394, "num_tokens": 843227148.0, "step": 22098 }, { "epoch": 2.8112199465716827, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.60126876831055, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8519070148468018, "num_tokens": 843259736.0, "step": 22099 }, { "epoch": 2.8113471568502737, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.17827606201172, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8591835498809814, "num_tokens": 843302598.0, "step": 22100 }, { "epoch": 2.811474367128864, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.14034652709961, "learning_rate": 1e-06, "loss": 0.5793, "mean_token_accuracy": 0.8796122074127197, "num_tokens": 843337615.0, "step": 22101 }, { "epoch": 2.8116015774074548, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.852725982666016, "learning_rate": 1e-06, "loss": 0.5749, "mean_token_accuracy": 0.879726767539978, "num_tokens": 843378845.0, "step": 22102 }, { "epoch": 2.811728787686045, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.299983978271484, "learning_rate": 1e-06, "loss": 0.6676, "mean_token_accuracy": 0.8514117002487183, "num_tokens": 843414128.0, "step": 22103 }, { "epoch": 2.811855997964636, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.44099426269531, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8800777792930603, "num_tokens": 843454255.0, "step": 22104 }, { "epoch": 2.811983208243226, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.38661575317383, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8629348278045654, "num_tokens": 843498332.0, "step": 22105 }, { "epoch": 2.8121104185218164, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.29249954223633, "learning_rate": 1e-06, "loss": 0.6509, "mean_token_accuracy": 0.8542367815971375, "num_tokens": 843539196.0, "step": 22106 }, { "epoch": 2.812237628800407, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.47320556640625, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8673436641693115, "num_tokens": 843573767.0, "step": 22107 }, { "epoch": 2.8123648390789975, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.67018127441406, "learning_rate": 1e-06, "loss": 0.6649, "mean_token_accuracy": 0.8502591252326965, "num_tokens": 843611156.0, "step": 22108 }, { "epoch": 2.812492049357588, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.08034133911133, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8686306476593018, "num_tokens": 843644026.0, "step": 22109 }, { "epoch": 2.8126192596361785, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.84552001953125, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8671576380729675, "num_tokens": 843683477.0, "step": 22110 }, { "epoch": 2.812746469914769, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.07970428466797, "learning_rate": 1e-06, "loss": 0.6636, "mean_token_accuracy": 0.8497592210769653, "num_tokens": 843722815.0, "step": 22111 }, { "epoch": 2.8128736801933596, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.13816833496094, "learning_rate": 1e-06, "loss": 0.6295, "mean_token_accuracy": 0.8658063411712646, "num_tokens": 843763843.0, "step": 22112 }, { "epoch": 2.81300089047195, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 51.610877990722656, "learning_rate": 1e-06, "loss": 0.6499, "mean_token_accuracy": 0.8553294539451599, "num_tokens": 843805327.0, "step": 22113 }, { "epoch": 2.8131281007505406, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.339229583740234, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8633708953857422, "num_tokens": 843850132.0, "step": 22114 }, { "epoch": 2.813255311029131, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 51.821285247802734, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.864087700843811, "num_tokens": 843887425.0, "step": 22115 }, { "epoch": 2.8133825213077217, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.98392868041992, "learning_rate": 1e-06, "loss": 0.6003, "mean_token_accuracy": 0.8721248507499695, "num_tokens": 843920728.0, "step": 22116 }, { "epoch": 2.8135097315863122, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.15715789794922, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.860956072807312, "num_tokens": 843959733.0, "step": 22117 }, { "epoch": 2.8136369418649028, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.19355773925781, "learning_rate": 1e-06, "loss": 0.6254, "mean_token_accuracy": 0.8645332455635071, "num_tokens": 844005616.0, "step": 22118 }, { "epoch": 2.8137641521434933, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.27663040161133, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8785191774368286, "num_tokens": 844034216.0, "step": 22119 }, { "epoch": 2.813891362422084, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.99551010131836, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8597095012664795, "num_tokens": 844071173.0, "step": 22120 }, { "epoch": 2.8140185727006743, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.831729888916016, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8606305718421936, "num_tokens": 844113665.0, "step": 22121 }, { "epoch": 2.814145782979265, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.353965759277344, "learning_rate": 1e-06, "loss": 0.6248, "mean_token_accuracy": 0.8624979853630066, "num_tokens": 844154063.0, "step": 22122 }, { "epoch": 2.8142729932578554, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.52777099609375, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8837598562240601, "num_tokens": 844185634.0, "step": 22123 }, { "epoch": 2.8144002035364455, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.686683654785156, "learning_rate": 1e-06, "loss": 0.649, "mean_token_accuracy": 0.8524336814880371, "num_tokens": 844221755.0, "step": 22124 }, { "epoch": 2.8145274138150365, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.382659912109375, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8686633110046387, "num_tokens": 844256230.0, "step": 22125 }, { "epoch": 2.8146546240936265, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.03275680541992, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8559786677360535, "num_tokens": 844292823.0, "step": 22126 }, { "epoch": 2.8147818343722175, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.18529510498047, "learning_rate": 1e-06, "loss": 0.6966, "mean_token_accuracy": 0.8413501381874084, "num_tokens": 844331843.0, "step": 22127 }, { "epoch": 2.8149090446508076, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.05143737792969, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8727166652679443, "num_tokens": 844375823.0, "step": 22128 }, { "epoch": 2.815036254929398, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.96588897705078, "learning_rate": 1e-06, "loss": 0.6246, "mean_token_accuracy": 0.8635367155075073, "num_tokens": 844415187.0, "step": 22129 }, { "epoch": 2.8151634652079887, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.27104949951172, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8682264089584351, "num_tokens": 844454986.0, "step": 22130 }, { "epoch": 2.815290675486579, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.66825485229492, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8697085976600647, "num_tokens": 844495514.0, "step": 22131 }, { "epoch": 2.8154178857651697, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.546226501464844, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.863324761390686, "num_tokens": 844532406.0, "step": 22132 }, { "epoch": 2.8155450960437602, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.470245361328125, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8619527816772461, "num_tokens": 844569561.0, "step": 22133 }, { "epoch": 2.8156723063223508, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.613956451416016, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8681472539901733, "num_tokens": 844613504.0, "step": 22134 }, { "epoch": 2.8157995166009413, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.71031951904297, "learning_rate": 1e-06, "loss": 0.6349, "mean_token_accuracy": 0.8622173070907593, "num_tokens": 844649255.0, "step": 22135 }, { "epoch": 2.815926726879532, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.94227600097656, "learning_rate": 1e-06, "loss": 0.6222, "mean_token_accuracy": 0.8652464747428894, "num_tokens": 844687202.0, "step": 22136 }, { "epoch": 2.8160539371581224, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.33224868774414, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8642565608024597, "num_tokens": 844723096.0, "step": 22137 }, { "epoch": 2.816181147436713, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.606346130371094, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8635880351066589, "num_tokens": 844759754.0, "step": 22138 }, { "epoch": 2.8163083577153034, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.930885314941406, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8660538792610168, "num_tokens": 844798372.0, "step": 22139 }, { "epoch": 2.816435567993894, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.893489837646484, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8580610156059265, "num_tokens": 844832738.0, "step": 22140 }, { "epoch": 2.8165627782724845, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.1468391418457, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8645116090774536, "num_tokens": 844871684.0, "step": 22141 }, { "epoch": 2.816689988551075, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.8758430480957, "learning_rate": 1e-06, "loss": 0.6494, "mean_token_accuracy": 0.8553463816642761, "num_tokens": 844912140.0, "step": 22142 }, { "epoch": 2.8168171988296655, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.19816589355469, "learning_rate": 1e-06, "loss": 0.6513, "mean_token_accuracy": 0.8516931533813477, "num_tokens": 844948256.0, "step": 22143 }, { "epoch": 2.816944409108256, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.771812438964844, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8784804344177246, "num_tokens": 844983821.0, "step": 22144 }, { "epoch": 2.8170716193868466, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.315284729003906, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.867793083190918, "num_tokens": 845020622.0, "step": 22145 }, { "epoch": 2.817198829665437, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.58428955078125, "learning_rate": 1e-06, "loss": 0.6556, "mean_token_accuracy": 0.8535779714584351, "num_tokens": 845056748.0, "step": 22146 }, { "epoch": 2.8173260399440276, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.61979293823242, "learning_rate": 1e-06, "loss": 0.683, "mean_token_accuracy": 0.8438635468482971, "num_tokens": 845096294.0, "step": 22147 }, { "epoch": 2.817453250222618, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.25434112548828, "learning_rate": 1e-06, "loss": 0.5689, "mean_token_accuracy": 0.8785029053688049, "num_tokens": 845135132.0, "step": 22148 }, { "epoch": 2.8175804605012083, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.97639465332031, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8720215559005737, "num_tokens": 845172164.0, "step": 22149 }, { "epoch": 2.8177076707797992, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.99837875366211, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8642454147338867, "num_tokens": 845210270.0, "step": 22150 }, { "epoch": 2.8178348810583893, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.8264045715332, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8670958280563354, "num_tokens": 845251863.0, "step": 22151 }, { "epoch": 2.8179620913369803, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.53396224975586, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8693472146987915, "num_tokens": 845290262.0, "step": 22152 }, { "epoch": 2.8180893016155704, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.70212936401367, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8707736730575562, "num_tokens": 845326894.0, "step": 22153 }, { "epoch": 2.818216511894161, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.840087890625, "learning_rate": 1e-06, "loss": 0.6013, "mean_token_accuracy": 0.8743454217910767, "num_tokens": 845367545.0, "step": 22154 }, { "epoch": 2.8183437221727514, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.88080978393555, "learning_rate": 1e-06, "loss": 0.6802, "mean_token_accuracy": 0.848940372467041, "num_tokens": 845400920.0, "step": 22155 }, { "epoch": 2.818470932451342, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.75706100463867, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8604736328125, "num_tokens": 845436279.0, "step": 22156 }, { "epoch": 2.8185981427299325, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.69046401977539, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8669996857643127, "num_tokens": 845476308.0, "step": 22157 }, { "epoch": 2.818725353008523, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.605220794677734, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8594847917556763, "num_tokens": 845516624.0, "step": 22158 }, { "epoch": 2.8188525632871135, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.68745422363281, "learning_rate": 1e-06, "loss": 0.564, "mean_token_accuracy": 0.8812013864517212, "num_tokens": 845552385.0, "step": 22159 }, { "epoch": 2.818979773565704, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.43980026245117, "learning_rate": 1e-06, "loss": 0.5873, "mean_token_accuracy": 0.873010516166687, "num_tokens": 845592600.0, "step": 22160 }, { "epoch": 2.8191069838442946, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.26029968261719, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8731858730316162, "num_tokens": 845630028.0, "step": 22161 }, { "epoch": 2.819234194122885, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.41691207885742, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.8592255115509033, "num_tokens": 845665874.0, "step": 22162 }, { "epoch": 2.8193614044014756, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.683597564697266, "learning_rate": 1e-06, "loss": 0.6658, "mean_token_accuracy": 0.8515094518661499, "num_tokens": 845702738.0, "step": 22163 }, { "epoch": 2.819488614680066, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.1494026184082, "learning_rate": 1e-06, "loss": 0.6318, "mean_token_accuracy": 0.8691859245300293, "num_tokens": 845744785.0, "step": 22164 }, { "epoch": 2.8196158249586567, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.56654357910156, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8627787828445435, "num_tokens": 845779196.0, "step": 22165 }, { "epoch": 2.8197430352372472, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.556514739990234, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8524207472801208, "num_tokens": 845819016.0, "step": 22166 }, { "epoch": 2.8198702455158378, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.67335891723633, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.865067720413208, "num_tokens": 845855014.0, "step": 22167 }, { "epoch": 2.8199974557944283, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.4055061340332, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8730681538581848, "num_tokens": 845895491.0, "step": 22168 }, { "epoch": 2.820124666073019, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.94567108154297, "learning_rate": 1e-06, "loss": 0.6596, "mean_token_accuracy": 0.8551199436187744, "num_tokens": 845931560.0, "step": 22169 }, { "epoch": 2.8202518763516093, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.300106048583984, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8653920888900757, "num_tokens": 845966701.0, "step": 22170 }, { "epoch": 2.8203790866302, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.859195709228516, "learning_rate": 1e-06, "loss": 0.5738, "mean_token_accuracy": 0.8812928795814514, "num_tokens": 846002327.0, "step": 22171 }, { "epoch": 2.82050629690879, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.14894104003906, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8639076948165894, "num_tokens": 846042157.0, "step": 22172 }, { "epoch": 2.820633507187381, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.1619758605957, "learning_rate": 1e-06, "loss": 0.6935, "mean_token_accuracy": 0.8494408130645752, "num_tokens": 846082851.0, "step": 22173 }, { "epoch": 2.820760717465971, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.01148223876953, "learning_rate": 1e-06, "loss": 0.618, "mean_token_accuracy": 0.8622641563415527, "num_tokens": 846116846.0, "step": 22174 }, { "epoch": 2.820887927744562, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.91360092163086, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8658205270767212, "num_tokens": 846154373.0, "step": 22175 }, { "epoch": 2.821015138023152, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.115482330322266, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8819444179534912, "num_tokens": 846195329.0, "step": 22176 }, { "epoch": 2.821142348301743, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.10063171386719, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8767609596252441, "num_tokens": 846233078.0, "step": 22177 }, { "epoch": 2.821269558580333, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.37895202636719, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8628033995628357, "num_tokens": 846269030.0, "step": 22178 }, { "epoch": 2.8213967688589237, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.753211975097656, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.8580719232559204, "num_tokens": 846307269.0, "step": 22179 }, { "epoch": 2.821523979137514, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.491294860839844, "learning_rate": 1e-06, "loss": 0.6468, "mean_token_accuracy": 0.8532248139381409, "num_tokens": 846339116.0, "step": 22180 }, { "epoch": 2.8216511894161047, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.58714294433594, "learning_rate": 1e-06, "loss": 0.6431, "mean_token_accuracy": 0.8556044101715088, "num_tokens": 846384968.0, "step": 22181 }, { "epoch": 2.8217783996946952, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.557212829589844, "learning_rate": 1e-06, "loss": 0.6701, "mean_token_accuracy": 0.8472837209701538, "num_tokens": 846426235.0, "step": 22182 }, { "epoch": 2.8219056099732858, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.352169036865234, "learning_rate": 1e-06, "loss": 0.662, "mean_token_accuracy": 0.8503715395927429, "num_tokens": 846465715.0, "step": 22183 }, { "epoch": 2.8220328202518763, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.64775466918945, "learning_rate": 1e-06, "loss": 0.6725, "mean_token_accuracy": 0.8496531248092651, "num_tokens": 846499972.0, "step": 22184 }, { "epoch": 2.822160030530467, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.312400817871094, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8629112243652344, "num_tokens": 846535906.0, "step": 22185 }, { "epoch": 2.8222872408090574, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.36222457885742, "learning_rate": 1e-06, "loss": 0.6456, "mean_token_accuracy": 0.8566700220108032, "num_tokens": 846572723.0, "step": 22186 }, { "epoch": 2.822414451087648, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.55451583862305, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8668169975280762, "num_tokens": 846613501.0, "step": 22187 }, { "epoch": 2.8225416613662384, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.807621002197266, "learning_rate": 1e-06, "loss": 0.5634, "mean_token_accuracy": 0.884566068649292, "num_tokens": 846650001.0, "step": 22188 }, { "epoch": 2.822668871644829, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 51.89262771606445, "learning_rate": 1e-06, "loss": 0.6593, "mean_token_accuracy": 0.8510198593139648, "num_tokens": 846690636.0, "step": 22189 }, { "epoch": 2.8227960819234195, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.42508316040039, "learning_rate": 1e-06, "loss": 0.6996, "mean_token_accuracy": 0.8410954475402832, "num_tokens": 846728513.0, "step": 22190 }, { "epoch": 2.82292329220201, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.83303451538086, "learning_rate": 1e-06, "loss": 0.6756, "mean_token_accuracy": 0.8488186597824097, "num_tokens": 846771989.0, "step": 22191 }, { "epoch": 2.8230505024806005, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.935211181640625, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8809890747070312, "num_tokens": 846808393.0, "step": 22192 }, { "epoch": 2.823177712759191, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.700069427490234, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.8536334037780762, "num_tokens": 846846928.0, "step": 22193 }, { "epoch": 2.8233049230377816, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 51.7941780090332, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8538402915000916, "num_tokens": 846884637.0, "step": 22194 }, { "epoch": 2.823432133316372, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.809471130371094, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8671851754188538, "num_tokens": 846925749.0, "step": 22195 }, { "epoch": 2.8235593435949626, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.126670837402344, "learning_rate": 1e-06, "loss": 0.6825, "mean_token_accuracy": 0.8473854064941406, "num_tokens": 846967191.0, "step": 22196 }, { "epoch": 2.8236865538735527, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.29706573486328, "learning_rate": 1e-06, "loss": 0.6057, "mean_token_accuracy": 0.8698303699493408, "num_tokens": 847003280.0, "step": 22197 }, { "epoch": 2.8238137641521437, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.147193908691406, "learning_rate": 1e-06, "loss": 0.6609, "mean_token_accuracy": 0.8490047454833984, "num_tokens": 847042179.0, "step": 22198 }, { "epoch": 2.8239409744307338, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.9924201965332, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8591727018356323, "num_tokens": 847083008.0, "step": 22199 }, { "epoch": 2.8240681847093247, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.46644973754883, "learning_rate": 1e-06, "loss": 0.6914, "mean_token_accuracy": 0.8421188592910767, "num_tokens": 847124009.0, "step": 22200 }, { "epoch": 2.824195394987915, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.59785461425781, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8583097457885742, "num_tokens": 847163526.0, "step": 22201 }, { "epoch": 2.824322605266506, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.827003479003906, "learning_rate": 1e-06, "loss": 0.6391, "mean_token_accuracy": 0.8618062734603882, "num_tokens": 847201374.0, "step": 22202 }, { "epoch": 2.824449815545096, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.20869064331055, "learning_rate": 1e-06, "loss": 0.6762, "mean_token_accuracy": 0.8483196496963501, "num_tokens": 847239469.0, "step": 22203 }, { "epoch": 2.8245770258236864, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.98312759399414, "learning_rate": 1e-06, "loss": 0.641, "mean_token_accuracy": 0.8611052632331848, "num_tokens": 847275964.0, "step": 22204 }, { "epoch": 2.824704236102277, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.212669372558594, "learning_rate": 1e-06, "loss": 0.6358, "mean_token_accuracy": 0.8641355633735657, "num_tokens": 847316815.0, "step": 22205 }, { "epoch": 2.8248314463808675, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.95660400390625, "learning_rate": 1e-06, "loss": 0.6168, "mean_token_accuracy": 0.8667239546775818, "num_tokens": 847350949.0, "step": 22206 }, { "epoch": 2.824958656659458, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.212284088134766, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8668900728225708, "num_tokens": 847388945.0, "step": 22207 }, { "epoch": 2.8250858669380485, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.86581802368164, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.8744796514511108, "num_tokens": 847435114.0, "step": 22208 }, { "epoch": 2.825213077216639, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.372215270996094, "learning_rate": 1e-06, "loss": 0.6493, "mean_token_accuracy": 0.8550121784210205, "num_tokens": 847470815.0, "step": 22209 }, { "epoch": 2.8253402874952296, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.047794342041016, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8610550165176392, "num_tokens": 847509379.0, "step": 22210 }, { "epoch": 2.82546749777382, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.015926361083984, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8659622669219971, "num_tokens": 847547427.0, "step": 22211 }, { "epoch": 2.8255947080524106, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.210391998291016, "learning_rate": 1e-06, "loss": 0.6987, "mean_token_accuracy": 0.8451564311981201, "num_tokens": 847584740.0, "step": 22212 }, { "epoch": 2.825721918331001, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.4647102355957, "learning_rate": 1e-06, "loss": 0.6564, "mean_token_accuracy": 0.8513861894607544, "num_tokens": 847626201.0, "step": 22213 }, { "epoch": 2.8258491286095917, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.10304641723633, "learning_rate": 1e-06, "loss": 0.5818, "mean_token_accuracy": 0.8766686916351318, "num_tokens": 847661311.0, "step": 22214 }, { "epoch": 2.8259763388881822, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.25545883178711, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8610838651657104, "num_tokens": 847695234.0, "step": 22215 }, { "epoch": 2.8261035491667728, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.82035446166992, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.881288468837738, "num_tokens": 847738848.0, "step": 22216 }, { "epoch": 2.8262307594453633, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.447837829589844, "learning_rate": 1e-06, "loss": 0.6053, "mean_token_accuracy": 0.8662511110305786, "num_tokens": 847775952.0, "step": 22217 }, { "epoch": 2.826357969723954, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.20285415649414, "learning_rate": 1e-06, "loss": 0.5767, "mean_token_accuracy": 0.8799055814743042, "num_tokens": 847809381.0, "step": 22218 }, { "epoch": 2.8264851800025443, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.32896423339844, "learning_rate": 1e-06, "loss": 0.637, "mean_token_accuracy": 0.8603678941726685, "num_tokens": 847849835.0, "step": 22219 }, { "epoch": 2.826612390281135, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.098323822021484, "learning_rate": 1e-06, "loss": 0.6252, "mean_token_accuracy": 0.859902560710907, "num_tokens": 847882726.0, "step": 22220 }, { "epoch": 2.8267396005597254, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 51.85751724243164, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.863447904586792, "num_tokens": 847920663.0, "step": 22221 }, { "epoch": 2.8268668108383155, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.63974380493164, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.869958758354187, "num_tokens": 847962204.0, "step": 22222 }, { "epoch": 2.8269940211169065, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.09309005737305, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8548168540000916, "num_tokens": 848003722.0, "step": 22223 }, { "epoch": 2.8271212313954965, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.6673583984375, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8706437945365906, "num_tokens": 848045335.0, "step": 22224 }, { "epoch": 2.8272484416740875, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.159908294677734, "learning_rate": 1e-06, "loss": 0.6448, "mean_token_accuracy": 0.8554222583770752, "num_tokens": 848088817.0, "step": 22225 }, { "epoch": 2.8273756519526776, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.76582336425781, "learning_rate": 1e-06, "loss": 0.6601, "mean_token_accuracy": 0.8521231412887573, "num_tokens": 848120993.0, "step": 22226 }, { "epoch": 2.827502862231268, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.92506408691406, "learning_rate": 1e-06, "loss": 0.66, "mean_token_accuracy": 0.8539350628852844, "num_tokens": 848162542.0, "step": 22227 }, { "epoch": 2.8276300725098586, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.697898864746094, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8746130466461182, "num_tokens": 848198194.0, "step": 22228 }, { "epoch": 2.827757282788449, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.72787857055664, "learning_rate": 1e-06, "loss": 0.6443, "mean_token_accuracy": 0.8576876521110535, "num_tokens": 848240575.0, "step": 22229 }, { "epoch": 2.8278844930670397, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 54.08135986328125, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8634532690048218, "num_tokens": 848289026.0, "step": 22230 }, { "epoch": 2.8280117033456302, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.62944793701172, "learning_rate": 1e-06, "loss": 0.6062, "mean_token_accuracy": 0.8642963171005249, "num_tokens": 848326438.0, "step": 22231 }, { "epoch": 2.8281389136242208, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.70878982543945, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8702290654182434, "num_tokens": 848368106.0, "step": 22232 }, { "epoch": 2.8282661239028113, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.592247009277344, "learning_rate": 1e-06, "loss": 0.626, "mean_token_accuracy": 0.8598165512084961, "num_tokens": 848409775.0, "step": 22233 }, { "epoch": 2.828393334181402, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.488189697265625, "learning_rate": 1e-06, "loss": 0.6253, "mean_token_accuracy": 0.8645235300064087, "num_tokens": 848442149.0, "step": 22234 }, { "epoch": 2.8285205444599923, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.4628791809082, "learning_rate": 1e-06, "loss": 0.6338, "mean_token_accuracy": 0.8578090071678162, "num_tokens": 848475468.0, "step": 22235 }, { "epoch": 2.828647754738583, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.53493881225586, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8630526661872864, "num_tokens": 848509565.0, "step": 22236 }, { "epoch": 2.8287749650171734, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.60940933227539, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8638722896575928, "num_tokens": 848546357.0, "step": 22237 }, { "epoch": 2.828902175295764, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.36256790161133, "learning_rate": 1e-06, "loss": 0.6556, "mean_token_accuracy": 0.8516581058502197, "num_tokens": 848587661.0, "step": 22238 }, { "epoch": 2.8290293855743545, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.51613235473633, "learning_rate": 1e-06, "loss": 0.5762, "mean_token_accuracy": 0.8677133321762085, "num_tokens": 848628250.0, "step": 22239 }, { "epoch": 2.829156595852945, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.94923400878906, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.863957405090332, "num_tokens": 848670452.0, "step": 22240 }, { "epoch": 2.8292838061315355, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.64963912963867, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8645435571670532, "num_tokens": 848703592.0, "step": 22241 }, { "epoch": 2.829411016410126, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.518741607666016, "learning_rate": 1e-06, "loss": 0.6801, "mean_token_accuracy": 0.8465058207511902, "num_tokens": 848739291.0, "step": 22242 }, { "epoch": 2.8295382266887166, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.15945816040039, "learning_rate": 1e-06, "loss": 0.6402, "mean_token_accuracy": 0.8568511009216309, "num_tokens": 848776039.0, "step": 22243 }, { "epoch": 2.829665436967307, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.891754150390625, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8662453889846802, "num_tokens": 848814799.0, "step": 22244 }, { "epoch": 2.8297926472458976, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.24204635620117, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8648865222930908, "num_tokens": 848860123.0, "step": 22245 }, { "epoch": 2.829919857524488, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.592681884765625, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.880040168762207, "num_tokens": 848900447.0, "step": 22246 }, { "epoch": 2.8300470678030782, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.409934997558594, "learning_rate": 1e-06, "loss": 0.6541, "mean_token_accuracy": 0.8557913303375244, "num_tokens": 848945385.0, "step": 22247 }, { "epoch": 2.830174278081669, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.683502197265625, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8526812791824341, "num_tokens": 848978783.0, "step": 22248 }, { "epoch": 2.8303014883602593, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.887535095214844, "learning_rate": 1e-06, "loss": 0.6681, "mean_token_accuracy": 0.8515912890434265, "num_tokens": 849017324.0, "step": 22249 }, { "epoch": 2.8304286986388503, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.17949295043945, "learning_rate": 1e-06, "loss": 0.5664, "mean_token_accuracy": 0.884955883026123, "num_tokens": 849045928.0, "step": 22250 }, { "epoch": 2.8305559089174404, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.934810638427734, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8628158569335938, "num_tokens": 849080855.0, "step": 22251 }, { "epoch": 2.830683119196031, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.16663360595703, "learning_rate": 1e-06, "loss": 0.619, "mean_token_accuracy": 0.8661381006240845, "num_tokens": 849114835.0, "step": 22252 }, { "epoch": 2.8308103294746214, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.84688186645508, "learning_rate": 1e-06, "loss": 0.66, "mean_token_accuracy": 0.8519734144210815, "num_tokens": 849152632.0, "step": 22253 }, { "epoch": 2.830937539753212, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.19207000732422, "learning_rate": 1e-06, "loss": 0.6051, "mean_token_accuracy": 0.8712631464004517, "num_tokens": 849189911.0, "step": 22254 }, { "epoch": 2.8310647500318025, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.01470947265625, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8591271042823792, "num_tokens": 849227847.0, "step": 22255 }, { "epoch": 2.831191960310393, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.70941162109375, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8682452440261841, "num_tokens": 849265778.0, "step": 22256 }, { "epoch": 2.8313191705889835, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.38340377807617, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8623371124267578, "num_tokens": 849302488.0, "step": 22257 }, { "epoch": 2.831446380867574, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.4097785949707, "learning_rate": 1e-06, "loss": 0.5666, "mean_token_accuracy": 0.8784187436103821, "num_tokens": 849337483.0, "step": 22258 }, { "epoch": 2.8315735911461646, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.523162841796875, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8675369620323181, "num_tokens": 849378491.0, "step": 22259 }, { "epoch": 2.831700801424755, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.542232513427734, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8691930174827576, "num_tokens": 849418897.0, "step": 22260 }, { "epoch": 2.8318280117033456, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.07095718383789, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8663739562034607, "num_tokens": 849457057.0, "step": 22261 }, { "epoch": 2.831955221981936, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.905452728271484, "learning_rate": 1e-06, "loss": 0.6016, "mean_token_accuracy": 0.8675634860992432, "num_tokens": 849488825.0, "step": 22262 }, { "epoch": 2.8320824322605267, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.746971130371094, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8718097805976868, "num_tokens": 849528336.0, "step": 22263 }, { "epoch": 2.8322096425391172, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.9140510559082, "learning_rate": 1e-06, "loss": 0.5821, "mean_token_accuracy": 0.8763977885246277, "num_tokens": 849565373.0, "step": 22264 }, { "epoch": 2.8323368528177078, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.47924041748047, "learning_rate": 1e-06, "loss": 0.5752, "mean_token_accuracy": 0.8744380474090576, "num_tokens": 849600909.0, "step": 22265 }, { "epoch": 2.8324640630962983, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.61930465698242, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8545684814453125, "num_tokens": 849639499.0, "step": 22266 }, { "epoch": 2.832591273374889, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.56468200683594, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8735319972038269, "num_tokens": 849677232.0, "step": 22267 }, { "epoch": 2.8327184836534793, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.127685546875, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8694222569465637, "num_tokens": 849717187.0, "step": 22268 }, { "epoch": 2.83284569393207, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.16575241088867, "learning_rate": 1e-06, "loss": 0.5928, "mean_token_accuracy": 0.8726081848144531, "num_tokens": 849756545.0, "step": 22269 }, { "epoch": 2.83297290421066, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.24552536010742, "learning_rate": 1e-06, "loss": 0.5871, "mean_token_accuracy": 0.8649551868438721, "num_tokens": 849788670.0, "step": 22270 }, { "epoch": 2.833100114489251, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.96987533569336, "learning_rate": 1e-06, "loss": 0.6327, "mean_token_accuracy": 0.8638527393341064, "num_tokens": 849825715.0, "step": 22271 }, { "epoch": 2.833227324767841, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.685367584228516, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8692159056663513, "num_tokens": 849856435.0, "step": 22272 }, { "epoch": 2.833354535046432, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.743927001953125, "learning_rate": 1e-06, "loss": 0.5785, "mean_token_accuracy": 0.8773223161697388, "num_tokens": 849895634.0, "step": 22273 }, { "epoch": 2.833481745325022, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.03962707519531, "learning_rate": 1e-06, "loss": 0.6883, "mean_token_accuracy": 0.8425931334495544, "num_tokens": 849938646.0, "step": 22274 }, { "epoch": 2.833608955603613, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.1824951171875, "learning_rate": 1e-06, "loss": 0.6253, "mean_token_accuracy": 0.8623652458190918, "num_tokens": 849978642.0, "step": 22275 }, { "epoch": 2.833736165882203, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.33794403076172, "learning_rate": 1e-06, "loss": 0.6309, "mean_token_accuracy": 0.8625525236129761, "num_tokens": 850017820.0, "step": 22276 }, { "epoch": 2.8338633761607936, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.38542556762695, "learning_rate": 1e-06, "loss": 0.6799, "mean_token_accuracy": 0.8460297584533691, "num_tokens": 850056619.0, "step": 22277 }, { "epoch": 2.833990586439384, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.412574768066406, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8606544733047485, "num_tokens": 850092456.0, "step": 22278 }, { "epoch": 2.8341177967179747, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.12620544433594, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8602029085159302, "num_tokens": 850131598.0, "step": 22279 }, { "epoch": 2.8342450069965652, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.973392486572266, "learning_rate": 1e-06, "loss": 0.6605, "mean_token_accuracy": 0.8529557585716248, "num_tokens": 850170884.0, "step": 22280 }, { "epoch": 2.8343722172751558, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.47951126098633, "learning_rate": 1e-06, "loss": 0.62, "mean_token_accuracy": 0.8614448308944702, "num_tokens": 850206989.0, "step": 22281 }, { "epoch": 2.8344994275537463, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.29792022705078, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8593011498451233, "num_tokens": 850248602.0, "step": 22282 }, { "epoch": 2.834626637832337, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.496826171875, "learning_rate": 1e-06, "loss": 0.6452, "mean_token_accuracy": 0.853376030921936, "num_tokens": 850289790.0, "step": 22283 }, { "epoch": 2.8347538481109273, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.09653091430664, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8710155487060547, "num_tokens": 850326688.0, "step": 22284 }, { "epoch": 2.834881058389518, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.83440399169922, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8756681084632874, "num_tokens": 850366460.0, "step": 22285 }, { "epoch": 2.8350082686681084, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.01361846923828, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8713701963424683, "num_tokens": 850400046.0, "step": 22286 }, { "epoch": 2.835135478946699, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.36724090576172, "learning_rate": 1e-06, "loss": 0.6664, "mean_token_accuracy": 0.8529164791107178, "num_tokens": 850434374.0, "step": 22287 }, { "epoch": 2.8352626892252895, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.179603576660156, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8663455843925476, "num_tokens": 850469056.0, "step": 22288 }, { "epoch": 2.83538989950388, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.20933532714844, "learning_rate": 1e-06, "loss": 0.6301, "mean_token_accuracy": 0.8610072135925293, "num_tokens": 850505805.0, "step": 22289 }, { "epoch": 2.8355171097824705, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.40767288208008, "learning_rate": 1e-06, "loss": 0.6714, "mean_token_accuracy": 0.8514301776885986, "num_tokens": 850550056.0, "step": 22290 }, { "epoch": 2.835644320061061, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.436431884765625, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8722025156021118, "num_tokens": 850585075.0, "step": 22291 }, { "epoch": 2.8357715303396516, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.92864990234375, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8740984201431274, "num_tokens": 850625758.0, "step": 22292 }, { "epoch": 2.835898740618242, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.789058685302734, "learning_rate": 1e-06, "loss": 0.5675, "mean_token_accuracy": 0.8762584924697876, "num_tokens": 850664608.0, "step": 22293 }, { "epoch": 2.8360259508968326, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.77510070800781, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.872607409954071, "num_tokens": 850705897.0, "step": 22294 }, { "epoch": 2.8361531611754227, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.9569206237793, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.8586958646774292, "num_tokens": 850746195.0, "step": 22295 }, { "epoch": 2.8362803714540137, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.46859359741211, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8700430989265442, "num_tokens": 850785183.0, "step": 22296 }, { "epoch": 2.8364075817326038, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.33975601196289, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8689053058624268, "num_tokens": 850819558.0, "step": 22297 }, { "epoch": 2.8365347920111947, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.537593841552734, "learning_rate": 1e-06, "loss": 0.6485, "mean_token_accuracy": 0.85496985912323, "num_tokens": 850857385.0, "step": 22298 }, { "epoch": 2.836662002289785, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.091278076171875, "learning_rate": 1e-06, "loss": 0.6476, "mean_token_accuracy": 0.8580933809280396, "num_tokens": 850895297.0, "step": 22299 }, { "epoch": 2.836789212568376, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.829734802246094, "learning_rate": 1e-06, "loss": 0.6305, "mean_token_accuracy": 0.8625640869140625, "num_tokens": 850930982.0, "step": 22300 }, { "epoch": 2.836916422846966, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.58940505981445, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8598405122756958, "num_tokens": 850969032.0, "step": 22301 }, { "epoch": 2.8370436331255564, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.75142288208008, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.86655592918396, "num_tokens": 851008325.0, "step": 22302 }, { "epoch": 2.837170843404147, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.32258987426758, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8661819696426392, "num_tokens": 851044289.0, "step": 22303 }, { "epoch": 2.8372980536827375, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.92359924316406, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8704252243041992, "num_tokens": 851079887.0, "step": 22304 }, { "epoch": 2.837425263961328, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.77192687988281, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8676482439041138, "num_tokens": 851124535.0, "step": 22305 }, { "epoch": 2.8375524742399185, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.51428985595703, "learning_rate": 1e-06, "loss": 0.5862, "mean_token_accuracy": 0.8735641241073608, "num_tokens": 851162982.0, "step": 22306 }, { "epoch": 2.837679684518509, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.148521423339844, "learning_rate": 1e-06, "loss": 0.5839, "mean_token_accuracy": 0.8728684186935425, "num_tokens": 851201864.0, "step": 22307 }, { "epoch": 2.8378068947970996, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.73896789550781, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.8667507171630859, "num_tokens": 851241659.0, "step": 22308 }, { "epoch": 2.83793410507569, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.06672286987305, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.855133056640625, "num_tokens": 851278861.0, "step": 22309 }, { "epoch": 2.8380613153542806, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.983707427978516, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8648250102996826, "num_tokens": 851314442.0, "step": 22310 }, { "epoch": 2.838188525632871, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.85206604003906, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8581966161727905, "num_tokens": 851356322.0, "step": 22311 }, { "epoch": 2.8383157359114617, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.35159683227539, "learning_rate": 1e-06, "loss": 0.6595, "mean_token_accuracy": 0.8527634739875793, "num_tokens": 851395374.0, "step": 22312 }, { "epoch": 2.838442946190052, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.4437370300293, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.86252760887146, "num_tokens": 851430957.0, "step": 22313 }, { "epoch": 2.8385701564686427, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.18768310546875, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8594557046890259, "num_tokens": 851474357.0, "step": 22314 }, { "epoch": 2.8386973667472333, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.485713958740234, "learning_rate": 1e-06, "loss": 0.573, "mean_token_accuracy": 0.8766827583312988, "num_tokens": 851510346.0, "step": 22315 }, { "epoch": 2.838824577025824, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.563133239746094, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8662585616111755, "num_tokens": 851546798.0, "step": 22316 }, { "epoch": 2.8389517873044143, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.74172592163086, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8664419651031494, "num_tokens": 851587819.0, "step": 22317 }, { "epoch": 2.839078997583005, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.145259857177734, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.867924690246582, "num_tokens": 851628764.0, "step": 22318 }, { "epoch": 2.8392062078615954, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.8136100769043, "learning_rate": 1e-06, "loss": 0.6643, "mean_token_accuracy": 0.8512410521507263, "num_tokens": 851672206.0, "step": 22319 }, { "epoch": 2.8393334181401855, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.3424186706543, "learning_rate": 1e-06, "loss": 0.6431, "mean_token_accuracy": 0.8585377931594849, "num_tokens": 851709969.0, "step": 22320 }, { "epoch": 2.8394606284187764, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.44993591308594, "learning_rate": 1e-06, "loss": 0.6503, "mean_token_accuracy": 0.8570153117179871, "num_tokens": 851749015.0, "step": 22321 }, { "epoch": 2.8395878386973665, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.54651641845703, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8687624931335449, "num_tokens": 851787210.0, "step": 22322 }, { "epoch": 2.8397150489759575, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.244205474853516, "learning_rate": 1e-06, "loss": 0.6368, "mean_token_accuracy": 0.8573755621910095, "num_tokens": 851824677.0, "step": 22323 }, { "epoch": 2.8398422592545476, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.83717727661133, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8698310852050781, "num_tokens": 851863614.0, "step": 22324 }, { "epoch": 2.839969469533138, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.349143981933594, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8570338487625122, "num_tokens": 851908153.0, "step": 22325 }, { "epoch": 2.8400966798117286, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.44906997680664, "learning_rate": 1e-06, "loss": 0.644, "mean_token_accuracy": 0.8608230352401733, "num_tokens": 851946473.0, "step": 22326 }, { "epoch": 2.840223890090319, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.79393768310547, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.8522974848747253, "num_tokens": 851991600.0, "step": 22327 }, { "epoch": 2.8403511003689097, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.64979934692383, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8654875755310059, "num_tokens": 852029468.0, "step": 22328 }, { "epoch": 2.8404783106475002, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.120697021484375, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8685998916625977, "num_tokens": 852061400.0, "step": 22329 }, { "epoch": 2.8406055209260908, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.540245056152344, "learning_rate": 1e-06, "loss": 0.6354, "mean_token_accuracy": 0.8632384538650513, "num_tokens": 852100596.0, "step": 22330 }, { "epoch": 2.8407327312046813, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.11383056640625, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8661131858825684, "num_tokens": 852140320.0, "step": 22331 }, { "epoch": 2.840859941483272, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.658485412597656, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8625741004943848, "num_tokens": 852175969.0, "step": 22332 }, { "epoch": 2.8409871517618623, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.85294723510742, "learning_rate": 1e-06, "loss": 0.6737, "mean_token_accuracy": 0.8485965728759766, "num_tokens": 852210816.0, "step": 22333 }, { "epoch": 2.841114362040453, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.50872802734375, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8625507354736328, "num_tokens": 852246684.0, "step": 22334 }, { "epoch": 2.8412415723190434, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.25481033325195, "learning_rate": 1e-06, "loss": 0.6274, "mean_token_accuracy": 0.8665807843208313, "num_tokens": 852285283.0, "step": 22335 }, { "epoch": 2.841368782597634, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.67433166503906, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8712487816810608, "num_tokens": 852323310.0, "step": 22336 }, { "epoch": 2.8414959928762245, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.96269607543945, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8666690587997437, "num_tokens": 852360850.0, "step": 22337 }, { "epoch": 2.841623203154815, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.99720001220703, "learning_rate": 1e-06, "loss": 0.6388, "mean_token_accuracy": 0.8565435409545898, "num_tokens": 852399320.0, "step": 22338 }, { "epoch": 2.8417504134334055, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.98236083984375, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8603348731994629, "num_tokens": 852430980.0, "step": 22339 }, { "epoch": 2.841877623711996, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.104984283447266, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8744442462921143, "num_tokens": 852465688.0, "step": 22340 }, { "epoch": 2.8420048339905866, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.26862716674805, "learning_rate": 1e-06, "loss": 0.6291, "mean_token_accuracy": 0.8608047962188721, "num_tokens": 852504372.0, "step": 22341 }, { "epoch": 2.842132044269177, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.06951904296875, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8687714338302612, "num_tokens": 852542232.0, "step": 22342 }, { "epoch": 2.8422592545477676, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.58578109741211, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.8546528816223145, "num_tokens": 852580005.0, "step": 22343 }, { "epoch": 2.842386464826358, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.23751449584961, "learning_rate": 1e-06, "loss": 0.5826, "mean_token_accuracy": 0.8758941888809204, "num_tokens": 852619912.0, "step": 22344 }, { "epoch": 2.8425136751049482, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.483890533447266, "learning_rate": 1e-06, "loss": 0.6079, "mean_token_accuracy": 0.865193784236908, "num_tokens": 852654349.0, "step": 22345 }, { "epoch": 2.842640885383539, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.4621467590332, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8561628460884094, "num_tokens": 852691300.0, "step": 22346 }, { "epoch": 2.8427680956621293, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.57475662231445, "learning_rate": 1e-06, "loss": 0.6206, "mean_token_accuracy": 0.8633238077163696, "num_tokens": 852731228.0, "step": 22347 }, { "epoch": 2.8428953059407203, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.63292694091797, "learning_rate": 1e-06, "loss": 0.6224, "mean_token_accuracy": 0.8623349666595459, "num_tokens": 852768333.0, "step": 22348 }, { "epoch": 2.8430225162193103, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.317630767822266, "learning_rate": 1e-06, "loss": 0.7234, "mean_token_accuracy": 0.8314077854156494, "num_tokens": 852810962.0, "step": 22349 }, { "epoch": 2.843149726497901, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.168739318847656, "learning_rate": 1e-06, "loss": 0.5863, "mean_token_accuracy": 0.8776862621307373, "num_tokens": 852844326.0, "step": 22350 }, { "epoch": 2.8432769367764914, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.10689163208008, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.857201099395752, "num_tokens": 852877737.0, "step": 22351 }, { "epoch": 2.843404147055082, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.377159118652344, "learning_rate": 1e-06, "loss": 0.5946, "mean_token_accuracy": 0.8730245232582092, "num_tokens": 852922085.0, "step": 22352 }, { "epoch": 2.8435313573336725, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.55412673950195, "learning_rate": 1e-06, "loss": 0.5699, "mean_token_accuracy": 0.8800423741340637, "num_tokens": 852960742.0, "step": 22353 }, { "epoch": 2.843658567612263, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.2653694152832, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.867945671081543, "num_tokens": 852993502.0, "step": 22354 }, { "epoch": 2.8437857778908535, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.15575408935547, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8695761561393738, "num_tokens": 853036844.0, "step": 22355 }, { "epoch": 2.843912988169444, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.55070877075195, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.8625643253326416, "num_tokens": 853070851.0, "step": 22356 }, { "epoch": 2.8440401984480346, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.8720588684082, "learning_rate": 1e-06, "loss": 0.6355, "mean_token_accuracy": 0.8621030449867249, "num_tokens": 853106116.0, "step": 22357 }, { "epoch": 2.844167408726625, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.736385345458984, "learning_rate": 1e-06, "loss": 0.6258, "mean_token_accuracy": 0.8616979718208313, "num_tokens": 853152776.0, "step": 22358 }, { "epoch": 2.8442946190052156, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.735050201416016, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8620434403419495, "num_tokens": 853194176.0, "step": 22359 }, { "epoch": 2.844421829283806, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.44026184082031, "learning_rate": 1e-06, "loss": 0.6355, "mean_token_accuracy": 0.8607752919197083, "num_tokens": 853236834.0, "step": 22360 }, { "epoch": 2.8445490395623967, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.025489807128906, "learning_rate": 1e-06, "loss": 0.6554, "mean_token_accuracy": 0.8546401262283325, "num_tokens": 853274145.0, "step": 22361 }, { "epoch": 2.844676249840987, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.83013153076172, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8650509119033813, "num_tokens": 853312569.0, "step": 22362 }, { "epoch": 2.8448034601195777, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.973026275634766, "learning_rate": 1e-06, "loss": 0.6026, "mean_token_accuracy": 0.8682562112808228, "num_tokens": 853351742.0, "step": 22363 }, { "epoch": 2.8449306703981683, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.3919792175293, "learning_rate": 1e-06, "loss": 0.6191, "mean_token_accuracy": 0.8638178706169128, "num_tokens": 853389607.0, "step": 22364 }, { "epoch": 2.845057880676759, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.48886489868164, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8704469799995422, "num_tokens": 853427218.0, "step": 22365 }, { "epoch": 2.8451850909553493, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.17768859863281, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8677374720573425, "num_tokens": 853464773.0, "step": 22366 }, { "epoch": 2.84531230123394, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.79096603393555, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.8642871379852295, "num_tokens": 853507250.0, "step": 22367 }, { "epoch": 2.84543951151253, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.046634674072266, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8658187389373779, "num_tokens": 853551947.0, "step": 22368 }, { "epoch": 2.845566721791121, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.069007873535156, "learning_rate": 1e-06, "loss": 0.6541, "mean_token_accuracy": 0.8565548062324524, "num_tokens": 853590557.0, "step": 22369 }, { "epoch": 2.845693932069711, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.66114044189453, "learning_rate": 1e-06, "loss": 0.6464, "mean_token_accuracy": 0.8591436147689819, "num_tokens": 853630870.0, "step": 22370 }, { "epoch": 2.845821142348302, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.28561019897461, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.861482560634613, "num_tokens": 853662360.0, "step": 22371 }, { "epoch": 2.845948352626892, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.356693267822266, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8574920892715454, "num_tokens": 853703661.0, "step": 22372 }, { "epoch": 2.846075562905483, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.09969711303711, "learning_rate": 1e-06, "loss": 0.6767, "mean_token_accuracy": 0.8515282869338989, "num_tokens": 853742398.0, "step": 22373 }, { "epoch": 2.846202773184073, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.53840255737305, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8706042766571045, "num_tokens": 853781479.0, "step": 22374 }, { "epoch": 2.8463299834626636, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.159690856933594, "learning_rate": 1e-06, "loss": 0.7075, "mean_token_accuracy": 0.838601291179657, "num_tokens": 853822330.0, "step": 22375 }, { "epoch": 2.846457193741254, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.41410827636719, "learning_rate": 1e-06, "loss": 0.6533, "mean_token_accuracy": 0.8501431941986084, "num_tokens": 853862747.0, "step": 22376 }, { "epoch": 2.8465844040198447, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.98379898071289, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8875837922096252, "num_tokens": 853897124.0, "step": 22377 }, { "epoch": 2.8467116142984352, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.71072006225586, "learning_rate": 1e-06, "loss": 0.677, "mean_token_accuracy": 0.8495457768440247, "num_tokens": 853930516.0, "step": 22378 }, { "epoch": 2.8468388245770258, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.088321685791016, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8685528039932251, "num_tokens": 853967016.0, "step": 22379 }, { "epoch": 2.8469660348556163, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.73662567138672, "learning_rate": 1e-06, "loss": 0.6688, "mean_token_accuracy": 0.8476489186286926, "num_tokens": 854008804.0, "step": 22380 }, { "epoch": 2.847093245134207, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.098350524902344, "learning_rate": 1e-06, "loss": 0.6742, "mean_token_accuracy": 0.850831151008606, "num_tokens": 854048393.0, "step": 22381 }, { "epoch": 2.8472204554127973, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.69709396362305, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8687632083892822, "num_tokens": 854089480.0, "step": 22382 }, { "epoch": 2.847347665691388, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.929107666015625, "learning_rate": 1e-06, "loss": 0.5915, "mean_token_accuracy": 0.8734805583953857, "num_tokens": 854125677.0, "step": 22383 }, { "epoch": 2.8474748759699784, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.69202423095703, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8594388961791992, "num_tokens": 854160485.0, "step": 22384 }, { "epoch": 2.847602086248569, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.79970932006836, "learning_rate": 1e-06, "loss": 0.6616, "mean_token_accuracy": 0.855668306350708, "num_tokens": 854200635.0, "step": 22385 }, { "epoch": 2.8477292965271594, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.250892639160156, "learning_rate": 1e-06, "loss": 0.6142, "mean_token_accuracy": 0.8677254915237427, "num_tokens": 854240046.0, "step": 22386 }, { "epoch": 2.84785650680575, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.781341552734375, "learning_rate": 1e-06, "loss": 0.592, "mean_token_accuracy": 0.8736796379089355, "num_tokens": 854275880.0, "step": 22387 }, { "epoch": 2.8479837170843405, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.82779312133789, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8643657565116882, "num_tokens": 854311204.0, "step": 22388 }, { "epoch": 2.848110927362931, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.13154983520508, "learning_rate": 1e-06, "loss": 0.6331, "mean_token_accuracy": 0.8614421486854553, "num_tokens": 854346076.0, "step": 22389 }, { "epoch": 2.8482381376415216, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.288047790527344, "learning_rate": 1e-06, "loss": 0.5976, "mean_token_accuracy": 0.8714842796325684, "num_tokens": 854383075.0, "step": 22390 }, { "epoch": 2.848365347920112, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.272125244140625, "learning_rate": 1e-06, "loss": 0.6654, "mean_token_accuracy": 0.8549792170524597, "num_tokens": 854416011.0, "step": 22391 }, { "epoch": 2.8484925581987026, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.432987213134766, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.865954577922821, "num_tokens": 854449520.0, "step": 22392 }, { "epoch": 2.8486197684772927, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.81243133544922, "learning_rate": 1e-06, "loss": 0.6305, "mean_token_accuracy": 0.8619229793548584, "num_tokens": 854484355.0, "step": 22393 }, { "epoch": 2.8487469787558837, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.96086883544922, "learning_rate": 1e-06, "loss": 0.6781, "mean_token_accuracy": 0.8486023545265198, "num_tokens": 854526238.0, "step": 22394 }, { "epoch": 2.8488741890344738, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.24875259399414, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.8660259246826172, "num_tokens": 854562619.0, "step": 22395 }, { "epoch": 2.8490013993130647, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.168434143066406, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8685680031776428, "num_tokens": 854604974.0, "step": 22396 }, { "epoch": 2.849128609591655, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.07011413574219, "learning_rate": 1e-06, "loss": 0.5988, "mean_token_accuracy": 0.8713012337684631, "num_tokens": 854644066.0, "step": 22397 }, { "epoch": 2.849255819870246, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.286678314208984, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8664486408233643, "num_tokens": 854682723.0, "step": 22398 }, { "epoch": 2.849383030148836, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.33552932739258, "learning_rate": 1e-06, "loss": 0.5914, "mean_token_accuracy": 0.8742841482162476, "num_tokens": 854720292.0, "step": 22399 }, { "epoch": 2.8495102404274264, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.079166412353516, "learning_rate": 1e-06, "loss": 0.6522, "mean_token_accuracy": 0.8607221841812134, "num_tokens": 854756917.0, "step": 22400 }, { "epoch": 2.849637450706017, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.92635726928711, "learning_rate": 1e-06, "loss": 0.6257, "mean_token_accuracy": 0.8641091585159302, "num_tokens": 854798771.0, "step": 22401 }, { "epoch": 2.8497646609846075, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.11159133911133, "learning_rate": 1e-06, "loss": 0.6194, "mean_token_accuracy": 0.8617150783538818, "num_tokens": 854838193.0, "step": 22402 }, { "epoch": 2.849891871263198, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.546409606933594, "learning_rate": 1e-06, "loss": 0.652, "mean_token_accuracy": 0.8576894998550415, "num_tokens": 854871650.0, "step": 22403 }, { "epoch": 2.8500190815417885, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.02334213256836, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8624341487884521, "num_tokens": 854910757.0, "step": 22404 }, { "epoch": 2.850146291820379, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.42341995239258, "learning_rate": 1e-06, "loss": 0.6606, "mean_token_accuracy": 0.8506761789321899, "num_tokens": 854945018.0, "step": 22405 }, { "epoch": 2.8502735020989696, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.57213592529297, "learning_rate": 1e-06, "loss": 0.6386, "mean_token_accuracy": 0.8560205698013306, "num_tokens": 854988369.0, "step": 22406 }, { "epoch": 2.85040071237756, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.730010986328125, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8644027709960938, "num_tokens": 855023935.0, "step": 22407 }, { "epoch": 2.8505279226561506, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.8314094543457, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8638885021209717, "num_tokens": 855062002.0, "step": 22408 }, { "epoch": 2.850655132934741, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.35198211669922, "learning_rate": 1e-06, "loss": 0.6181, "mean_token_accuracy": 0.8658177852630615, "num_tokens": 855101257.0, "step": 22409 }, { "epoch": 2.8507823432133317, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.936302185058594, "learning_rate": 1e-06, "loss": 0.6239, "mean_token_accuracy": 0.8645862936973572, "num_tokens": 855135834.0, "step": 22410 }, { "epoch": 2.850909553491922, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.21674346923828, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8639086484909058, "num_tokens": 855174156.0, "step": 22411 }, { "epoch": 2.8510367637705127, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.80146026611328, "learning_rate": 1e-06, "loss": 0.5972, "mean_token_accuracy": 0.8734332323074341, "num_tokens": 855207698.0, "step": 22412 }, { "epoch": 2.8511639740491033, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.38004684448242, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8625009059906006, "num_tokens": 855240636.0, "step": 22413 }, { "epoch": 2.851291184327694, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.72526931762695, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8660602569580078, "num_tokens": 855279923.0, "step": 22414 }, { "epoch": 2.8514183946062843, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.09982681274414, "learning_rate": 1e-06, "loss": 0.6603, "mean_token_accuracy": 0.8515729904174805, "num_tokens": 855315460.0, "step": 22415 }, { "epoch": 2.851545604884875, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.87650680541992, "learning_rate": 1e-06, "loss": 0.5891, "mean_token_accuracy": 0.8720617294311523, "num_tokens": 855351548.0, "step": 22416 }, { "epoch": 2.8516728151634654, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.557254791259766, "learning_rate": 1e-06, "loss": 0.6839, "mean_token_accuracy": 0.8445498943328857, "num_tokens": 855390715.0, "step": 22417 }, { "epoch": 2.8518000254420555, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.38753890991211, "learning_rate": 1e-06, "loss": 0.6928, "mean_token_accuracy": 0.84987473487854, "num_tokens": 855431997.0, "step": 22418 }, { "epoch": 2.8519272357206464, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.571067810058594, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.8580725193023682, "num_tokens": 855470242.0, "step": 22419 }, { "epoch": 2.8520544459992365, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.44101333618164, "learning_rate": 1e-06, "loss": 0.6597, "mean_token_accuracy": 0.8497922420501709, "num_tokens": 855508901.0, "step": 22420 }, { "epoch": 2.8521816562778275, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.23598861694336, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8608303070068359, "num_tokens": 855545290.0, "step": 22421 }, { "epoch": 2.8523088665564176, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.48270797729492, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.858529806137085, "num_tokens": 855579567.0, "step": 22422 }, { "epoch": 2.852436076835008, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.924583435058594, "learning_rate": 1e-06, "loss": 0.6493, "mean_token_accuracy": 0.856753408908844, "num_tokens": 855621117.0, "step": 22423 }, { "epoch": 2.8525632871135986, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.042701721191406, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.8684653043746948, "num_tokens": 855660494.0, "step": 22424 }, { "epoch": 2.852690497392189, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.49456024169922, "learning_rate": 1e-06, "loss": 0.6434, "mean_token_accuracy": 0.8613183498382568, "num_tokens": 855694043.0, "step": 22425 }, { "epoch": 2.8528177076707797, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.06093978881836, "learning_rate": 1e-06, "loss": 0.6599, "mean_token_accuracy": 0.8486778736114502, "num_tokens": 855734615.0, "step": 22426 }, { "epoch": 2.85294491794937, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.31222915649414, "learning_rate": 1e-06, "loss": 0.5952, "mean_token_accuracy": 0.8740253448486328, "num_tokens": 855771141.0, "step": 22427 }, { "epoch": 2.8530721282279607, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.440006256103516, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8619056344032288, "num_tokens": 855806877.0, "step": 22428 }, { "epoch": 2.8531993385065513, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.301387786865234, "learning_rate": 1e-06, "loss": 0.5845, "mean_token_accuracy": 0.8775845766067505, "num_tokens": 855840076.0, "step": 22429 }, { "epoch": 2.853326548785142, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.372886657714844, "learning_rate": 1e-06, "loss": 0.5983, "mean_token_accuracy": 0.8748975992202759, "num_tokens": 855877972.0, "step": 22430 }, { "epoch": 2.8534537590637323, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.98709487915039, "learning_rate": 1e-06, "loss": 0.6511, "mean_token_accuracy": 0.8569837212562561, "num_tokens": 855917424.0, "step": 22431 }, { "epoch": 2.853580969342323, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.25607681274414, "learning_rate": 1e-06, "loss": 0.6554, "mean_token_accuracy": 0.8529425263404846, "num_tokens": 855954853.0, "step": 22432 }, { "epoch": 2.8537081796209134, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.880523681640625, "learning_rate": 1e-06, "loss": 0.617, "mean_token_accuracy": 0.8656249046325684, "num_tokens": 855993300.0, "step": 22433 }, { "epoch": 2.853835389899504, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.26279067993164, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8654689192771912, "num_tokens": 856035307.0, "step": 22434 }, { "epoch": 2.8539626001780944, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.933135986328125, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.8619917631149292, "num_tokens": 856071801.0, "step": 22435 }, { "epoch": 2.854089810456685, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.214195251464844, "learning_rate": 1e-06, "loss": 0.6483, "mean_token_accuracy": 0.8523697257041931, "num_tokens": 856112035.0, "step": 22436 }, { "epoch": 2.8542170207352755, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.70417785644531, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.868097186088562, "num_tokens": 856148275.0, "step": 22437 }, { "epoch": 2.854344231013866, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.067508697509766, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8658032417297363, "num_tokens": 856180330.0, "step": 22438 }, { "epoch": 2.8544714412924566, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.81327819824219, "learning_rate": 1e-06, "loss": 0.6776, "mean_token_accuracy": 0.8486723899841309, "num_tokens": 856219354.0, "step": 22439 }, { "epoch": 2.854598651571047, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.49562072753906, "learning_rate": 1e-06, "loss": 0.5954, "mean_token_accuracy": 0.8736972808837891, "num_tokens": 856257583.0, "step": 22440 }, { "epoch": 2.8547258618496376, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.08333206176758, "learning_rate": 1e-06, "loss": 0.6778, "mean_token_accuracy": 0.8458987474441528, "num_tokens": 856298056.0, "step": 22441 }, { "epoch": 2.854853072128228, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.499752044677734, "learning_rate": 1e-06, "loss": 0.6426, "mean_token_accuracy": 0.8588213920593262, "num_tokens": 856334109.0, "step": 22442 }, { "epoch": 2.8549802824068182, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.794517517089844, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8663747310638428, "num_tokens": 856376655.0, "step": 22443 }, { "epoch": 2.855107492685409, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.393409729003906, "learning_rate": 1e-06, "loss": 0.5682, "mean_token_accuracy": 0.8795101046562195, "num_tokens": 856409926.0, "step": 22444 }, { "epoch": 2.8552347029639993, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.43925857543945, "learning_rate": 1e-06, "loss": 0.6656, "mean_token_accuracy": 0.8525815606117249, "num_tokens": 856451471.0, "step": 22445 }, { "epoch": 2.8553619132425903, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.594085693359375, "learning_rate": 1e-06, "loss": 0.596, "mean_token_accuracy": 0.8724008202552795, "num_tokens": 856483237.0, "step": 22446 }, { "epoch": 2.8554891235211803, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.456119537353516, "learning_rate": 1e-06, "loss": 0.6662, "mean_token_accuracy": 0.8527987003326416, "num_tokens": 856521278.0, "step": 22447 }, { "epoch": 2.855616333799771, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.43101119995117, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8601648807525635, "num_tokens": 856565048.0, "step": 22448 }, { "epoch": 2.8557435440783614, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.48479080200195, "learning_rate": 1e-06, "loss": 0.6164, "mean_token_accuracy": 0.8673812747001648, "num_tokens": 856603756.0, "step": 22449 }, { "epoch": 2.855870754356952, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 51.98257064819336, "learning_rate": 1e-06, "loss": 0.6154, "mean_token_accuracy": 0.870071530342102, "num_tokens": 856640677.0, "step": 22450 }, { "epoch": 2.8559979646355425, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.53453826904297, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8647422790527344, "num_tokens": 856679982.0, "step": 22451 }, { "epoch": 2.856125174914133, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.264793395996094, "learning_rate": 1e-06, "loss": 0.6578, "mean_token_accuracy": 0.8513495922088623, "num_tokens": 856720621.0, "step": 22452 }, { "epoch": 2.8562523851927235, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.039878845214844, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8720122575759888, "num_tokens": 856757172.0, "step": 22453 }, { "epoch": 2.856379595471314, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.0596809387207, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8603658676147461, "num_tokens": 856796876.0, "step": 22454 }, { "epoch": 2.8565068057499046, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.7194709777832, "learning_rate": 1e-06, "loss": 0.6309, "mean_token_accuracy": 0.8620045185089111, "num_tokens": 856831148.0, "step": 22455 }, { "epoch": 2.856634016028495, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.56474304199219, "learning_rate": 1e-06, "loss": 0.6363, "mean_token_accuracy": 0.8583401441574097, "num_tokens": 856867091.0, "step": 22456 }, { "epoch": 2.8567612263070856, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.3077507019043, "learning_rate": 1e-06, "loss": 0.67, "mean_token_accuracy": 0.8511335849761963, "num_tokens": 856906118.0, "step": 22457 }, { "epoch": 2.856888436585676, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.94552230834961, "learning_rate": 1e-06, "loss": 0.7125, "mean_token_accuracy": 0.8369280099868774, "num_tokens": 856942694.0, "step": 22458 }, { "epoch": 2.8570156468642667, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.4034538269043, "learning_rate": 1e-06, "loss": 0.65, "mean_token_accuracy": 0.855056643486023, "num_tokens": 856977530.0, "step": 22459 }, { "epoch": 2.857142857142857, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.807098388671875, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8660430908203125, "num_tokens": 857012417.0, "step": 22460 }, { "epoch": 2.8572700674214477, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.05515670776367, "learning_rate": 1e-06, "loss": 0.6666, "mean_token_accuracy": 0.8566840887069702, "num_tokens": 857053556.0, "step": 22461 }, { "epoch": 2.8573972777000383, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.193363189697266, "learning_rate": 1e-06, "loss": 0.6152, "mean_token_accuracy": 0.8669450879096985, "num_tokens": 857094514.0, "step": 22462 }, { "epoch": 2.857524487978629, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.188743591308594, "learning_rate": 1e-06, "loss": 0.629, "mean_token_accuracy": 0.8656255602836609, "num_tokens": 857133670.0, "step": 22463 }, { "epoch": 2.8576516982572193, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.23357009887695, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8545969724655151, "num_tokens": 857167934.0, "step": 22464 }, { "epoch": 2.85777890853581, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.6060791015625, "learning_rate": 1e-06, "loss": 0.6776, "mean_token_accuracy": 0.8452025651931763, "num_tokens": 857208263.0, "step": 22465 }, { "epoch": 2.8579061188144, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.60113525390625, "learning_rate": 1e-06, "loss": 0.6888, "mean_token_accuracy": 0.8449442386627197, "num_tokens": 857255107.0, "step": 22466 }, { "epoch": 2.858033329092991, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.068023681640625, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8606811165809631, "num_tokens": 857298368.0, "step": 22467 }, { "epoch": 2.858160539371581, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.31125259399414, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.8632743954658508, "num_tokens": 857336899.0, "step": 22468 }, { "epoch": 2.858287749650172, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.314022064208984, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8669342994689941, "num_tokens": 857378482.0, "step": 22469 }, { "epoch": 2.858414959928762, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.44321060180664, "learning_rate": 1e-06, "loss": 0.6014, "mean_token_accuracy": 0.8717515468597412, "num_tokens": 857413203.0, "step": 22470 }, { "epoch": 2.858542170207353, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.28595733642578, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8602344989776611, "num_tokens": 857446706.0, "step": 22471 }, { "epoch": 2.858669380485943, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.26960372924805, "learning_rate": 1e-06, "loss": 0.6299, "mean_token_accuracy": 0.8677221536636353, "num_tokens": 857483415.0, "step": 22472 }, { "epoch": 2.8587965907645336, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.62022018432617, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8676159381866455, "num_tokens": 857525323.0, "step": 22473 }, { "epoch": 2.858923801043124, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.75639724731445, "learning_rate": 1e-06, "loss": 0.5904, "mean_token_accuracy": 0.8788094520568848, "num_tokens": 857564400.0, "step": 22474 }, { "epoch": 2.8590510113217147, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.851722717285156, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8605533838272095, "num_tokens": 857602756.0, "step": 22475 }, { "epoch": 2.859178221600305, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.56922149658203, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.863652229309082, "num_tokens": 857636875.0, "step": 22476 }, { "epoch": 2.8593054318788957, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.551856994628906, "learning_rate": 1e-06, "loss": 0.5775, "mean_token_accuracy": 0.879626989364624, "num_tokens": 857671950.0, "step": 22477 }, { "epoch": 2.8594326421574863, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.31327438354492, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.8757857084274292, "num_tokens": 857705274.0, "step": 22478 }, { "epoch": 2.859559852436077, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.26427459716797, "learning_rate": 1e-06, "loss": 0.646, "mean_token_accuracy": 0.8567823171615601, "num_tokens": 857743730.0, "step": 22479 }, { "epoch": 2.8596870627146673, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.13516616821289, "learning_rate": 1e-06, "loss": 0.6272, "mean_token_accuracy": 0.8605590462684631, "num_tokens": 857784055.0, "step": 22480 }, { "epoch": 2.859814272993258, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.662811279296875, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.868806004524231, "num_tokens": 857822431.0, "step": 22481 }, { "epoch": 2.8599414832718484, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.033634185791016, "learning_rate": 1e-06, "loss": 0.5893, "mean_token_accuracy": 0.8754295706748962, "num_tokens": 857859258.0, "step": 22482 }, { "epoch": 2.860068693550439, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.837100982666016, "learning_rate": 1e-06, "loss": 0.6548, "mean_token_accuracy": 0.8550612926483154, "num_tokens": 857901209.0, "step": 22483 }, { "epoch": 2.8601959038290294, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.92548751831055, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8757506608963013, "num_tokens": 857935563.0, "step": 22484 }, { "epoch": 2.86032311410762, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.48408126831055, "learning_rate": 1e-06, "loss": 0.6518, "mean_token_accuracy": 0.8540157079696655, "num_tokens": 857966777.0, "step": 22485 }, { "epoch": 2.8604503243862105, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.573143005371094, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8693305253982544, "num_tokens": 858005078.0, "step": 22486 }, { "epoch": 2.860577534664801, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.17279815673828, "learning_rate": 1e-06, "loss": 0.6667, "mean_token_accuracy": 0.8523086905479431, "num_tokens": 858049288.0, "step": 22487 }, { "epoch": 2.8607047449433916, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.986488342285156, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.88451087474823, "num_tokens": 858089834.0, "step": 22488 }, { "epoch": 2.860831955221982, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.41756820678711, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8685575723648071, "num_tokens": 858124152.0, "step": 22489 }, { "epoch": 2.8609591655005726, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.46492004394531, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8683944940567017, "num_tokens": 858163202.0, "step": 22490 }, { "epoch": 2.8610863757791627, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.53120422363281, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.8843964338302612, "num_tokens": 858199173.0, "step": 22491 }, { "epoch": 2.8612135860577537, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.29503631591797, "learning_rate": 1e-06, "loss": 0.6354, "mean_token_accuracy": 0.8612415194511414, "num_tokens": 858239693.0, "step": 22492 }, { "epoch": 2.8613407963363438, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.7984619140625, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8708367347717285, "num_tokens": 858280720.0, "step": 22493 }, { "epoch": 2.8614680066149347, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.8718376159668, "learning_rate": 1e-06, "loss": 0.5847, "mean_token_accuracy": 0.8761582970619202, "num_tokens": 858314145.0, "step": 22494 }, { "epoch": 2.861595216893525, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.074337005615234, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8649626970291138, "num_tokens": 858355177.0, "step": 22495 }, { "epoch": 2.861722427172116, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.457740783691406, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.8667418956756592, "num_tokens": 858393070.0, "step": 22496 }, { "epoch": 2.861849637450706, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.412994384765625, "learning_rate": 1e-06, "loss": 0.6372, "mean_token_accuracy": 0.8638876676559448, "num_tokens": 858431827.0, "step": 22497 }, { "epoch": 2.8619768477292964, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.50003433227539, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8611719012260437, "num_tokens": 858474990.0, "step": 22498 }, { "epoch": 2.862104058007887, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.2555046081543, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.8769828081130981, "num_tokens": 858514698.0, "step": 22499 }, { "epoch": 2.8622312682864774, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.5337028503418, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8629156351089478, "num_tokens": 858557601.0, "step": 22500 }, { "epoch": 2.862358478565068, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.50079345703125, "learning_rate": 1e-06, "loss": 0.5953, "mean_token_accuracy": 0.8731154203414917, "num_tokens": 858596830.0, "step": 22501 }, { "epoch": 2.8624856888436585, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.48036575317383, "learning_rate": 1e-06, "loss": 0.6617, "mean_token_accuracy": 0.8541370630264282, "num_tokens": 858637555.0, "step": 22502 }, { "epoch": 2.862612899122249, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.45330810546875, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8841005563735962, "num_tokens": 858672813.0, "step": 22503 }, { "epoch": 2.8627401094008396, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.336639404296875, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8713557720184326, "num_tokens": 858710743.0, "step": 22504 }, { "epoch": 2.86286731967943, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.46306610107422, "learning_rate": 1e-06, "loss": 0.5798, "mean_token_accuracy": 0.8755576610565186, "num_tokens": 858739486.0, "step": 22505 }, { "epoch": 2.8629945299580206, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.45512008666992, "learning_rate": 1e-06, "loss": 0.672, "mean_token_accuracy": 0.8437327146530151, "num_tokens": 858780678.0, "step": 22506 }, { "epoch": 2.863121740236611, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.073646545410156, "learning_rate": 1e-06, "loss": 0.6449, "mean_token_accuracy": 0.8596578240394592, "num_tokens": 858826112.0, "step": 22507 }, { "epoch": 2.8632489505152017, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.06220626831055, "learning_rate": 1e-06, "loss": 0.5805, "mean_token_accuracy": 0.8763653039932251, "num_tokens": 858862158.0, "step": 22508 }, { "epoch": 2.863376160793792, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.092918395996094, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8686014413833618, "num_tokens": 858905270.0, "step": 22509 }, { "epoch": 2.8635033710723827, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.167728424072266, "learning_rate": 1e-06, "loss": 0.6317, "mean_token_accuracy": 0.8578401207923889, "num_tokens": 858941802.0, "step": 22510 }, { "epoch": 2.8636305813509733, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.08428955078125, "learning_rate": 1e-06, "loss": 0.6111, "mean_token_accuracy": 0.8665934801101685, "num_tokens": 858974672.0, "step": 22511 }, { "epoch": 2.863757791629564, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.108497619628906, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8626877069473267, "num_tokens": 859015305.0, "step": 22512 }, { "epoch": 2.8638850019081543, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.84006881713867, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8553149104118347, "num_tokens": 859050646.0, "step": 22513 }, { "epoch": 2.864012212186745, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.11058044433594, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8724470138549805, "num_tokens": 859090139.0, "step": 22514 }, { "epoch": 2.8641394224653354, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.201942443847656, "learning_rate": 1e-06, "loss": 0.5968, "mean_token_accuracy": 0.8682632446289062, "num_tokens": 859130661.0, "step": 22515 }, { "epoch": 2.8642666327439255, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.50117492675781, "learning_rate": 1e-06, "loss": 0.618, "mean_token_accuracy": 0.8636690378189087, "num_tokens": 859166399.0, "step": 22516 }, { "epoch": 2.8643938430225164, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.15428924560547, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.8649939894676208, "num_tokens": 859208349.0, "step": 22517 }, { "epoch": 2.8645210533011065, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.205501556396484, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8670889139175415, "num_tokens": 859243784.0, "step": 22518 }, { "epoch": 2.8646482635796975, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.50434112548828, "learning_rate": 1e-06, "loss": 0.6662, "mean_token_accuracy": 0.8502169251441956, "num_tokens": 859278345.0, "step": 22519 }, { "epoch": 2.8647754738582876, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.621150970458984, "learning_rate": 1e-06, "loss": 0.6581, "mean_token_accuracy": 0.8543410897254944, "num_tokens": 859317454.0, "step": 22520 }, { "epoch": 2.864902684136878, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.54604721069336, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.859760582447052, "num_tokens": 859352076.0, "step": 22521 }, { "epoch": 2.8650298944154686, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.92940139770508, "learning_rate": 1e-06, "loss": 0.5971, "mean_token_accuracy": 0.870474636554718, "num_tokens": 859389957.0, "step": 22522 }, { "epoch": 2.865157104694059, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.267337799072266, "learning_rate": 1e-06, "loss": 0.6199, "mean_token_accuracy": 0.8641984462738037, "num_tokens": 859425167.0, "step": 22523 }, { "epoch": 2.8652843149726497, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.104766845703125, "learning_rate": 1e-06, "loss": 0.628, "mean_token_accuracy": 0.861396849155426, "num_tokens": 859470999.0, "step": 22524 }, { "epoch": 2.86541152525124, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.106929779052734, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.869454562664032, "num_tokens": 859503763.0, "step": 22525 }, { "epoch": 2.8655387355298307, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 54.06800842285156, "learning_rate": 1e-06, "loss": 0.5907, "mean_token_accuracy": 0.8742133975028992, "num_tokens": 859537739.0, "step": 22526 }, { "epoch": 2.8656659458084213, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.36941909790039, "learning_rate": 1e-06, "loss": 0.5746, "mean_token_accuracy": 0.8753055930137634, "num_tokens": 859573497.0, "step": 22527 }, { "epoch": 2.865793156087012, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.67478942871094, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8601680994033813, "num_tokens": 859606997.0, "step": 22528 }, { "epoch": 2.8659203663656023, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.64057922363281, "learning_rate": 1e-06, "loss": 0.6535, "mean_token_accuracy": 0.8508739471435547, "num_tokens": 859642278.0, "step": 22529 }, { "epoch": 2.866047576644193, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.6307258605957, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8762412667274475, "num_tokens": 859684728.0, "step": 22530 }, { "epoch": 2.8661747869227834, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 53.56254577636719, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8564175963401794, "num_tokens": 859720946.0, "step": 22531 }, { "epoch": 2.866301997201374, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.84020233154297, "learning_rate": 1e-06, "loss": 0.6229, "mean_token_accuracy": 0.8600613474845886, "num_tokens": 859752605.0, "step": 22532 }, { "epoch": 2.8664292074799644, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.30366897583008, "learning_rate": 1e-06, "loss": 0.5806, "mean_token_accuracy": 0.8749587535858154, "num_tokens": 859784206.0, "step": 22533 }, { "epoch": 2.866556417758555, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.17274475097656, "learning_rate": 1e-06, "loss": 0.6338, "mean_token_accuracy": 0.8615075349807739, "num_tokens": 859828142.0, "step": 22534 }, { "epoch": 2.8666836280371455, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 53.39836120605469, "learning_rate": 1e-06, "loss": 0.6236, "mean_token_accuracy": 0.860029935836792, "num_tokens": 859870032.0, "step": 22535 }, { "epoch": 2.866810838315736, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 53.40666198730469, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.8551305532455444, "num_tokens": 859911417.0, "step": 22536 }, { "epoch": 2.8669380485943265, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 53.471736907958984, "learning_rate": 1e-06, "loss": 0.5672, "mean_token_accuracy": 0.875033974647522, "num_tokens": 859953929.0, "step": 22537 }, { "epoch": 2.867065258872917, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.880863189697266, "learning_rate": 1e-06, "loss": 0.6015, "mean_token_accuracy": 0.8671921491622925, "num_tokens": 859985402.0, "step": 22538 }, { "epoch": 2.8671924691515076, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 53.102237701416016, "learning_rate": 1e-06, "loss": 0.6207, "mean_token_accuracy": 0.8616237640380859, "num_tokens": 860017374.0, "step": 22539 }, { "epoch": 2.867319679430098, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 54.83241653442383, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8739713430404663, "num_tokens": 860058217.0, "step": 22540 }, { "epoch": 2.867446889708688, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.44463348388672, "learning_rate": 1e-06, "loss": 0.6574, "mean_token_accuracy": 0.8480960130691528, "num_tokens": 860092317.0, "step": 22541 }, { "epoch": 2.867574099987279, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 54.395751953125, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8620810508728027, "num_tokens": 860125067.0, "step": 22542 }, { "epoch": 2.8677013102658693, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.596656799316406, "learning_rate": 1e-06, "loss": 0.5734, "mean_token_accuracy": 0.877394437789917, "num_tokens": 860163805.0, "step": 22543 }, { "epoch": 2.8678285205444602, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 54.47285842895508, "learning_rate": 1e-06, "loss": 0.6596, "mean_token_accuracy": 0.8515467047691345, "num_tokens": 860200178.0, "step": 22544 }, { "epoch": 2.8679557308230503, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 52.84938430786133, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8636074066162109, "num_tokens": 860235283.0, "step": 22545 }, { "epoch": 2.868082941101641, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.71929931640625, "learning_rate": 1e-06, "loss": 0.5671, "mean_token_accuracy": 0.8816498517990112, "num_tokens": 860276776.0, "step": 22546 }, { "epoch": 2.8682101513802314, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 53.5068244934082, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8685730695724487, "num_tokens": 860318492.0, "step": 22547 }, { "epoch": 2.868337361658822, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.644447326660156, "learning_rate": 1e-06, "loss": 0.655, "mean_token_accuracy": 0.8489699959754944, "num_tokens": 860358556.0, "step": 22548 }, { "epoch": 2.8684645719374124, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 54.070823669433594, "learning_rate": 1e-06, "loss": 0.6402, "mean_token_accuracy": 0.8520756959915161, "num_tokens": 860391114.0, "step": 22549 }, { "epoch": 2.868591782216003, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.35547637939453, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8596326112747192, "num_tokens": 860430696.0, "step": 22550 }, { "epoch": 2.8687189924945935, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 54.27162551879883, "learning_rate": 1e-06, "loss": 0.5999, "mean_token_accuracy": 0.8717082738876343, "num_tokens": 860471055.0, "step": 22551 }, { "epoch": 2.868846202773184, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.42290115356445, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8582448959350586, "num_tokens": 860512472.0, "step": 22552 }, { "epoch": 2.8689734130517746, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 54.61845016479492, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8585038185119629, "num_tokens": 860541014.0, "step": 22553 }, { "epoch": 2.869100623330365, "ewc_loss": 0.21484375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019073486328125, "grad_norm": 52.62272262573242, "learning_rate": 1e-06, "loss": 0.5677, "mean_token_accuracy": 0.8794806003570557, "num_tokens": 860585025.0, "step": 22554 }, { "epoch": 2.8692278336089556, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 54.21067810058594, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8721619844436646, "num_tokens": 860616361.0, "step": 22555 }, { "epoch": 2.869355043887546, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.28433609008789, "learning_rate": 1e-06, "loss": 0.6536, "mean_token_accuracy": 0.8550553917884827, "num_tokens": 860655976.0, "step": 22556 }, { "epoch": 2.8694822541661367, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.45809555053711, "learning_rate": 1e-06, "loss": 0.667, "mean_token_accuracy": 0.8509762287139893, "num_tokens": 860703949.0, "step": 22557 }, { "epoch": 2.869609464444727, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.661136627197266, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8628285527229309, "num_tokens": 860739935.0, "step": 22558 }, { "epoch": 2.8697366747233177, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.07083511352539, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.868743896484375, "num_tokens": 860782483.0, "step": 22559 }, { "epoch": 2.8698638850019083, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.84651184082031, "learning_rate": 1e-06, "loss": 0.6484, "mean_token_accuracy": 0.8530963659286499, "num_tokens": 860816821.0, "step": 22560 }, { "epoch": 2.869991095280499, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.98326873779297, "learning_rate": 1e-06, "loss": 0.6593, "mean_token_accuracy": 0.8523503541946411, "num_tokens": 860851313.0, "step": 22561 }, { "epoch": 2.8701183055590893, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.10692596435547, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8724281787872314, "num_tokens": 860890540.0, "step": 22562 }, { "epoch": 2.87024551583768, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.512020111083984, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8564027547836304, "num_tokens": 860929686.0, "step": 22563 }, { "epoch": 2.87037272611627, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.31325149536133, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8657712340354919, "num_tokens": 860968186.0, "step": 22564 }, { "epoch": 2.870499936394861, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.07156753540039, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8586825132369995, "num_tokens": 861007252.0, "step": 22565 }, { "epoch": 2.870627146673451, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.957252502441406, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8660001754760742, "num_tokens": 861049898.0, "step": 22566 }, { "epoch": 2.870754356952042, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.134769439697266, "learning_rate": 1e-06, "loss": 0.5978, "mean_token_accuracy": 0.8713170289993286, "num_tokens": 861087711.0, "step": 22567 }, { "epoch": 2.870881567230632, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.115440368652344, "learning_rate": 1e-06, "loss": 0.6848, "mean_token_accuracy": 0.843552827835083, "num_tokens": 861128566.0, "step": 22568 }, { "epoch": 2.871008777509223, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.053314208984375, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8685204982757568, "num_tokens": 861156299.0, "step": 22569 }, { "epoch": 2.871135987787813, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.728050231933594, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8739519119262695, "num_tokens": 861189376.0, "step": 22570 }, { "epoch": 2.8712631980664036, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.73139572143555, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8590444922447205, "num_tokens": 861227156.0, "step": 22571 }, { "epoch": 2.871390408344994, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.72374725341797, "learning_rate": 1e-06, "loss": 0.6268, "mean_token_accuracy": 0.8630971312522888, "num_tokens": 861268088.0, "step": 22572 }, { "epoch": 2.8715176186235847, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.83295440673828, "learning_rate": 1e-06, "loss": 0.6144, "mean_token_accuracy": 0.8655205965042114, "num_tokens": 861308892.0, "step": 22573 }, { "epoch": 2.871644828902175, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.20071792602539, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8678792715072632, "num_tokens": 861350108.0, "step": 22574 }, { "epoch": 2.8717720391807657, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.84308624267578, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.8747072219848633, "num_tokens": 861390553.0, "step": 22575 }, { "epoch": 2.8718992494593563, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.950679779052734, "learning_rate": 1e-06, "loss": 0.6017, "mean_token_accuracy": 0.8649388551712036, "num_tokens": 861425878.0, "step": 22576 }, { "epoch": 2.872026459737947, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.85033416748047, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8791467547416687, "num_tokens": 861460935.0, "step": 22577 }, { "epoch": 2.8721536700165373, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.1421012878418, "learning_rate": 1e-06, "loss": 0.6781, "mean_token_accuracy": 0.8469763398170471, "num_tokens": 861500446.0, "step": 22578 }, { "epoch": 2.872280880295128, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.24047088623047, "learning_rate": 1e-06, "loss": 0.6558, "mean_token_accuracy": 0.8514912128448486, "num_tokens": 861538000.0, "step": 22579 }, { "epoch": 2.8724080905737184, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.466312408447266, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8668323755264282, "num_tokens": 861578052.0, "step": 22580 }, { "epoch": 2.872535300852309, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.09800338745117, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8616775274276733, "num_tokens": 861619065.0, "step": 22581 }, { "epoch": 2.8726625111308994, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.85869598388672, "learning_rate": 1e-06, "loss": 0.6296, "mean_token_accuracy": 0.8679134845733643, "num_tokens": 861660112.0, "step": 22582 }, { "epoch": 2.87278972140949, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 51.971160888671875, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8663217425346375, "num_tokens": 861699312.0, "step": 22583 }, { "epoch": 2.8729169316880805, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.998355865478516, "learning_rate": 1e-06, "loss": 0.5764, "mean_token_accuracy": 0.8813064098358154, "num_tokens": 861735167.0, "step": 22584 }, { "epoch": 2.873044141966671, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.105525970458984, "learning_rate": 1e-06, "loss": 0.6395, "mean_token_accuracy": 0.8571019768714905, "num_tokens": 861778594.0, "step": 22585 }, { "epoch": 2.8731713522452615, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.60898208618164, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8626784682273865, "num_tokens": 861814754.0, "step": 22586 }, { "epoch": 2.873298562523852, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.17301940917969, "learning_rate": 1e-06, "loss": 0.6584, "mean_token_accuracy": 0.8477631211280823, "num_tokens": 861849019.0, "step": 22587 }, { "epoch": 2.8734257728024426, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.2378044128418, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.859721302986145, "num_tokens": 861889471.0, "step": 22588 }, { "epoch": 2.8735529830810327, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.625244140625, "learning_rate": 1e-06, "loss": 0.6501, "mean_token_accuracy": 0.8551562428474426, "num_tokens": 861926811.0, "step": 22589 }, { "epoch": 2.8736801933596237, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.976322174072266, "learning_rate": 1e-06, "loss": 0.6594, "mean_token_accuracy": 0.861521303653717, "num_tokens": 861963628.0, "step": 22590 }, { "epoch": 2.8738074036382137, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.381683349609375, "learning_rate": 1e-06, "loss": 0.6578, "mean_token_accuracy": 0.8491860628128052, "num_tokens": 862002915.0, "step": 22591 }, { "epoch": 2.8739346139168047, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.727481842041016, "learning_rate": 1e-06, "loss": 0.6066, "mean_token_accuracy": 0.8699651956558228, "num_tokens": 862047276.0, "step": 22592 }, { "epoch": 2.874061824195395, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.36448287963867, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8705017566680908, "num_tokens": 862091241.0, "step": 22593 }, { "epoch": 2.8741890344739858, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.69440460205078, "learning_rate": 1e-06, "loss": 0.6088, "mean_token_accuracy": 0.866996705532074, "num_tokens": 862126025.0, "step": 22594 }, { "epoch": 2.874316244752576, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.911617279052734, "learning_rate": 1e-06, "loss": 0.6559, "mean_token_accuracy": 0.8531248569488525, "num_tokens": 862160822.0, "step": 22595 }, { "epoch": 2.8744434550311664, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.90995788574219, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8684831857681274, "num_tokens": 862196808.0, "step": 22596 }, { "epoch": 2.874570665309757, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.36543655395508, "learning_rate": 1e-06, "loss": 0.6815, "mean_token_accuracy": 0.8485237956047058, "num_tokens": 862233722.0, "step": 22597 }, { "epoch": 2.8746978755883474, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.841548919677734, "learning_rate": 1e-06, "loss": 0.5744, "mean_token_accuracy": 0.8803637027740479, "num_tokens": 862271914.0, "step": 22598 }, { "epoch": 2.874825085866938, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.46302032470703, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8613902926445007, "num_tokens": 862313696.0, "step": 22599 }, { "epoch": 2.8749522961455285, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.45404815673828, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.8562437295913696, "num_tokens": 862355366.0, "step": 22600 }, { "epoch": 2.875079506424119, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.402137756347656, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8533398509025574, "num_tokens": 862395630.0, "step": 22601 }, { "epoch": 2.8752067167027096, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.20524978637695, "learning_rate": 1e-06, "loss": 0.6162, "mean_token_accuracy": 0.8621175289154053, "num_tokens": 862429650.0, "step": 22602 }, { "epoch": 2.8753339269813, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.225032806396484, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8615213632583618, "num_tokens": 862470921.0, "step": 22603 }, { "epoch": 2.8754611372598906, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.25627517700195, "learning_rate": 1e-06, "loss": 0.5937, "mean_token_accuracy": 0.8690451979637146, "num_tokens": 862505204.0, "step": 22604 }, { "epoch": 2.875588347538481, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.6656608581543, "learning_rate": 1e-06, "loss": 0.6439, "mean_token_accuracy": 0.8622896075248718, "num_tokens": 862548686.0, "step": 22605 }, { "epoch": 2.8757155578170717, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.28926467895508, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8779160976409912, "num_tokens": 862585920.0, "step": 22606 }, { "epoch": 2.875842768095662, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.635009765625, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.8586117625236511, "num_tokens": 862619364.0, "step": 22607 }, { "epoch": 2.8759699783742527, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.2330207824707, "learning_rate": 1e-06, "loss": 0.6476, "mean_token_accuracy": 0.8584145903587341, "num_tokens": 862661317.0, "step": 22608 }, { "epoch": 2.8760971886528433, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.78322219848633, "learning_rate": 1e-06, "loss": 0.6023, "mean_token_accuracy": 0.8677610754966736, "num_tokens": 862698236.0, "step": 22609 }, { "epoch": 2.876224398931434, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.50627899169922, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8748785257339478, "num_tokens": 862739224.0, "step": 22610 }, { "epoch": 2.8763516092100243, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.67326354980469, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8786582946777344, "num_tokens": 862776534.0, "step": 22611 }, { "epoch": 2.876478819488615, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.28203582763672, "learning_rate": 1e-06, "loss": 0.6516, "mean_token_accuracy": 0.8552037477493286, "num_tokens": 862815690.0, "step": 22612 }, { "epoch": 2.8766060297672054, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.856910705566406, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8631129264831543, "num_tokens": 862851021.0, "step": 22613 }, { "epoch": 2.8767332400457954, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.50709915161133, "learning_rate": 1e-06, "loss": 0.5943, "mean_token_accuracy": 0.8711829781532288, "num_tokens": 862885122.0, "step": 22614 }, { "epoch": 2.8768604503243864, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.565608978271484, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8699749708175659, "num_tokens": 862922233.0, "step": 22615 }, { "epoch": 2.8769876606029765, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.14335632324219, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8755502700805664, "num_tokens": 862961355.0, "step": 22616 }, { "epoch": 2.8771148708815675, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.596832275390625, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.873418927192688, "num_tokens": 862998523.0, "step": 22617 }, { "epoch": 2.8772420811601576, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 51.84406280517578, "learning_rate": 1e-06, "loss": 0.6172, "mean_token_accuracy": 0.8629205226898193, "num_tokens": 863039684.0, "step": 22618 }, { "epoch": 2.877369291438748, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.1789436340332, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8681506514549255, "num_tokens": 863077649.0, "step": 22619 }, { "epoch": 2.8774965017173386, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.43739318847656, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8592705130577087, "num_tokens": 863116816.0, "step": 22620 }, { "epoch": 2.877623711995929, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.67342758178711, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.8756341338157654, "num_tokens": 863155306.0, "step": 22621 }, { "epoch": 2.8777509222745197, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.619117736816406, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8708314895629883, "num_tokens": 863194381.0, "step": 22622 }, { "epoch": 2.87787813255311, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.42715835571289, "learning_rate": 1e-06, "loss": 0.6391, "mean_token_accuracy": 0.8626604080200195, "num_tokens": 863231470.0, "step": 22623 }, { "epoch": 2.8780053428317007, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.77501678466797, "learning_rate": 1e-06, "loss": 0.6566, "mean_token_accuracy": 0.8563511371612549, "num_tokens": 863265617.0, "step": 22624 }, { "epoch": 2.8781325531102913, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.31394577026367, "learning_rate": 1e-06, "loss": 0.6611, "mean_token_accuracy": 0.855300784111023, "num_tokens": 863300111.0, "step": 22625 }, { "epoch": 2.878259763388882, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.02713394165039, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.8709518313407898, "num_tokens": 863338626.0, "step": 22626 }, { "epoch": 2.8783869736674723, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.12963104248047, "learning_rate": 1e-06, "loss": 0.6633, "mean_token_accuracy": 0.8551342487335205, "num_tokens": 863374672.0, "step": 22627 }, { "epoch": 2.878514183946063, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.0911865234375, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8579657077789307, "num_tokens": 863412717.0, "step": 22628 }, { "epoch": 2.8786413942246534, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.68024826049805, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8714131116867065, "num_tokens": 863447776.0, "step": 22629 }, { "epoch": 2.878768604503244, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.3445930480957, "learning_rate": 1e-06, "loss": 0.5733, "mean_token_accuracy": 0.8818784952163696, "num_tokens": 863485697.0, "step": 22630 }, { "epoch": 2.8788958147818344, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.41181182861328, "learning_rate": 1e-06, "loss": 0.6338, "mean_token_accuracy": 0.8588945865631104, "num_tokens": 863528706.0, "step": 22631 }, { "epoch": 2.879023025060425, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.716304779052734, "learning_rate": 1e-06, "loss": 0.6789, "mean_token_accuracy": 0.8465061783790588, "num_tokens": 863569181.0, "step": 22632 }, { "epoch": 2.8791502353390155, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.00309371948242, "learning_rate": 1e-06, "loss": 0.6654, "mean_token_accuracy": 0.8476355075836182, "num_tokens": 863602985.0, "step": 22633 }, { "epoch": 2.879277445617606, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.99136734008789, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8683673143386841, "num_tokens": 863634590.0, "step": 22634 }, { "epoch": 2.8794046558961965, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.18129348754883, "learning_rate": 1e-06, "loss": 0.6378, "mean_token_accuracy": 0.8636593818664551, "num_tokens": 863677671.0, "step": 22635 }, { "epoch": 2.879531866174787, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 54.00664138793945, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8663935661315918, "num_tokens": 863716102.0, "step": 22636 }, { "epoch": 2.8796590764533776, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.426971435546875, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8558692336082458, "num_tokens": 863751560.0, "step": 22637 }, { "epoch": 2.879786286731968, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.65342330932617, "learning_rate": 1e-06, "loss": 0.5718, "mean_token_accuracy": 0.8791584968566895, "num_tokens": 863792373.0, "step": 22638 }, { "epoch": 2.879913497010558, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.37336730957031, "learning_rate": 1e-06, "loss": 0.6519, "mean_token_accuracy": 0.8609418869018555, "num_tokens": 863829969.0, "step": 22639 }, { "epoch": 2.880040707289149, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.47921371459961, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.8708406090736389, "num_tokens": 863867080.0, "step": 22640 }, { "epoch": 2.8801679175677393, "ewc_loss": 0.216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001926422119140625, "grad_norm": 52.52450180053711, "learning_rate": 1e-06, "loss": 0.5956, "mean_token_accuracy": 0.8680784106254578, "num_tokens": 863904230.0, "step": 22641 }, { "epoch": 2.8802951278463302, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 54.38308334350586, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.8690253496170044, "num_tokens": 863947309.0, "step": 22642 }, { "epoch": 2.8804223381249203, "ewc_loss": 0.2177734375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019359588623046875, "grad_norm": 52.74552917480469, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8702866435050964, "num_tokens": 863985557.0, "step": 22643 }, { "epoch": 2.880549548403511, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 54.01502227783203, "learning_rate": 1e-06, "loss": 0.6627, "mean_token_accuracy": 0.8531337976455688, "num_tokens": 864021842.0, "step": 22644 }, { "epoch": 2.8806767586821014, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.85246658325195, "learning_rate": 1e-06, "loss": 0.6396, "mean_token_accuracy": 0.8566299080848694, "num_tokens": 864060765.0, "step": 22645 }, { "epoch": 2.880803968960692, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.38463592529297, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.865199625492096, "num_tokens": 864100624.0, "step": 22646 }, { "epoch": 2.8809311792392824, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.495567321777344, "learning_rate": 1e-06, "loss": 0.5841, "mean_token_accuracy": 0.8748338222503662, "num_tokens": 864136599.0, "step": 22647 }, { "epoch": 2.881058389517873, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.29016876220703, "learning_rate": 1e-06, "loss": 0.6072, "mean_token_accuracy": 0.8681820631027222, "num_tokens": 864169876.0, "step": 22648 }, { "epoch": 2.8811855997964635, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.06917953491211, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8632849454879761, "num_tokens": 864213709.0, "step": 22649 }, { "epoch": 2.881312810075054, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.49190139770508, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8698465824127197, "num_tokens": 864245581.0, "step": 22650 }, { "epoch": 2.8814400203536445, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.167625427246094, "learning_rate": 1e-06, "loss": 0.6756, "mean_token_accuracy": 0.8497081995010376, "num_tokens": 864287081.0, "step": 22651 }, { "epoch": 2.881567230632235, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.93722152709961, "learning_rate": 1e-06, "loss": 0.677, "mean_token_accuracy": 0.8479695320129395, "num_tokens": 864321480.0, "step": 22652 }, { "epoch": 2.8816944409108256, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.130123138427734, "learning_rate": 1e-06, "loss": 0.6022, "mean_token_accuracy": 0.8705929517745972, "num_tokens": 864358580.0, "step": 22653 }, { "epoch": 2.881821651189416, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.56965255737305, "learning_rate": 1e-06, "loss": 0.6488, "mean_token_accuracy": 0.8537479043006897, "num_tokens": 864396277.0, "step": 22654 }, { "epoch": 2.8819488614680067, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.03658676147461, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8603571057319641, "num_tokens": 864433282.0, "step": 22655 }, { "epoch": 2.882076071746597, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.71265411376953, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8555941581726074, "num_tokens": 864469305.0, "step": 22656 }, { "epoch": 2.8822032820251877, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.430171966552734, "learning_rate": 1e-06, "loss": 0.6213, "mean_token_accuracy": 0.8647671937942505, "num_tokens": 864507246.0, "step": 22657 }, { "epoch": 2.8823304923037782, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.162147521972656, "learning_rate": 1e-06, "loss": 0.6408, "mean_token_accuracy": 0.8582478761672974, "num_tokens": 864547315.0, "step": 22658 }, { "epoch": 2.8824577025823688, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.32791519165039, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8573130965232849, "num_tokens": 864590813.0, "step": 22659 }, { "epoch": 2.8825849128609593, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.13240051269531, "learning_rate": 1e-06, "loss": 0.6538, "mean_token_accuracy": 0.855023980140686, "num_tokens": 864625078.0, "step": 22660 }, { "epoch": 2.88271212313955, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.270565032958984, "learning_rate": 1e-06, "loss": 0.5992, "mean_token_accuracy": 0.8713339567184448, "num_tokens": 864663714.0, "step": 22661 }, { "epoch": 2.88283933341814, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.78174591064453, "learning_rate": 1e-06, "loss": 0.6327, "mean_token_accuracy": 0.8622052669525146, "num_tokens": 864697026.0, "step": 22662 }, { "epoch": 2.882966543696731, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.37112045288086, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8730195164680481, "num_tokens": 864732337.0, "step": 22663 }, { "epoch": 2.883093753975321, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.7160758972168, "learning_rate": 1e-06, "loss": 0.643, "mean_token_accuracy": 0.8572113513946533, "num_tokens": 864776598.0, "step": 22664 }, { "epoch": 2.883220964253912, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.7176628112793, "learning_rate": 1e-06, "loss": 0.6763, "mean_token_accuracy": 0.8506613969802856, "num_tokens": 864817279.0, "step": 22665 }, { "epoch": 2.883348174532502, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.77835464477539, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8722679018974304, "num_tokens": 864854914.0, "step": 22666 }, { "epoch": 2.883475384811093, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.49166488647461, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.867114782333374, "num_tokens": 864892914.0, "step": 22667 }, { "epoch": 2.883602595089683, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.2045783996582, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8614312410354614, "num_tokens": 864934189.0, "step": 22668 }, { "epoch": 2.8837298053682736, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.2787971496582, "learning_rate": 1e-06, "loss": 0.6006, "mean_token_accuracy": 0.8740437030792236, "num_tokens": 864972085.0, "step": 22669 }, { "epoch": 2.883857015646864, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.41262435913086, "learning_rate": 1e-06, "loss": 0.6797, "mean_token_accuracy": 0.8478644490242004, "num_tokens": 865008674.0, "step": 22670 }, { "epoch": 2.8839842259254547, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.01078796386719, "learning_rate": 1e-06, "loss": 0.6488, "mean_token_accuracy": 0.8533228635787964, "num_tokens": 865046004.0, "step": 22671 }, { "epoch": 2.884111436204045, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.576927185058594, "learning_rate": 1e-06, "loss": 0.6063, "mean_token_accuracy": 0.8659612536430359, "num_tokens": 865082409.0, "step": 22672 }, { "epoch": 2.8842386464826357, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.7209587097168, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8622018694877625, "num_tokens": 865124457.0, "step": 22673 }, { "epoch": 2.8843658567612263, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.61627960205078, "learning_rate": 1e-06, "loss": 0.5964, "mean_token_accuracy": 0.875657320022583, "num_tokens": 865162290.0, "step": 22674 }, { "epoch": 2.884493067039817, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.3240852355957, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8589582443237305, "num_tokens": 865201686.0, "step": 22675 }, { "epoch": 2.8846202773184073, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.03487014770508, "learning_rate": 1e-06, "loss": 0.6574, "mean_token_accuracy": 0.8515714406967163, "num_tokens": 865236876.0, "step": 22676 }, { "epoch": 2.884747487596998, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.67308807373047, "learning_rate": 1e-06, "loss": 0.612, "mean_token_accuracy": 0.8680798411369324, "num_tokens": 865276994.0, "step": 22677 }, { "epoch": 2.8848746978755884, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.79384231567383, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8623456954956055, "num_tokens": 865311990.0, "step": 22678 }, { "epoch": 2.885001908154179, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.11451721191406, "learning_rate": 1e-06, "loss": 0.6106, "mean_token_accuracy": 0.8684590458869934, "num_tokens": 865352324.0, "step": 22679 }, { "epoch": 2.8851291184327694, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.2742805480957, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8627703189849854, "num_tokens": 865391428.0, "step": 22680 }, { "epoch": 2.88525632871136, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.30860900878906, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8598635196685791, "num_tokens": 865428478.0, "step": 22681 }, { "epoch": 2.8853835389899505, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.48239517211914, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.863207221031189, "num_tokens": 865466531.0, "step": 22682 }, { "epoch": 2.885510749268541, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.86384582519531, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8716274499893188, "num_tokens": 865503177.0, "step": 22683 }, { "epoch": 2.8856379595471315, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.08287048339844, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.8557406067848206, "num_tokens": 865539454.0, "step": 22684 }, { "epoch": 2.885765169825722, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.58230972290039, "learning_rate": 1e-06, "loss": 0.5693, "mean_token_accuracy": 0.8817956447601318, "num_tokens": 865571937.0, "step": 22685 }, { "epoch": 2.8858923801043126, "ewc_loss": 0.21875, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.24765396118164, "learning_rate": 1e-06, "loss": 0.608, "mean_token_accuracy": 0.8655983805656433, "num_tokens": 865609457.0, "step": 22686 }, { "epoch": 2.8860195903829027, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.86833572387695, "learning_rate": 1e-06, "loss": 0.6684, "mean_token_accuracy": 0.8504043817520142, "num_tokens": 865647391.0, "step": 22687 }, { "epoch": 2.8861468006614936, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.2966194152832, "learning_rate": 1e-06, "loss": 0.6195, "mean_token_accuracy": 0.8628095388412476, "num_tokens": 865680578.0, "step": 22688 }, { "epoch": 2.8862740109400837, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.29421615600586, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8590574264526367, "num_tokens": 865714610.0, "step": 22689 }, { "epoch": 2.8864012212186747, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.20452880859375, "learning_rate": 1e-06, "loss": 0.6522, "mean_token_accuracy": 0.8594201803207397, "num_tokens": 865755672.0, "step": 22690 }, { "epoch": 2.886528431497265, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.24606704711914, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8691199421882629, "num_tokens": 865790111.0, "step": 22691 }, { "epoch": 2.8866556417758558, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.99940490722656, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.86249840259552, "num_tokens": 865825580.0, "step": 22692 }, { "epoch": 2.886782852054446, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.63758087158203, "learning_rate": 1e-06, "loss": 0.6049, "mean_token_accuracy": 0.8672765493392944, "num_tokens": 865869099.0, "step": 22693 }, { "epoch": 2.8869100623330364, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.07481002807617, "learning_rate": 1e-06, "loss": 0.6629, "mean_token_accuracy": 0.8548151254653931, "num_tokens": 865906957.0, "step": 22694 }, { "epoch": 2.887037272611627, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.31878662109375, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8734132051467896, "num_tokens": 865945712.0, "step": 22695 }, { "epoch": 2.8871644828902174, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.109718322753906, "learning_rate": 1e-06, "loss": 0.6673, "mean_token_accuracy": 0.8500701785087585, "num_tokens": 865984469.0, "step": 22696 }, { "epoch": 2.887291693168808, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.43950271606445, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8647754788398743, "num_tokens": 866024624.0, "step": 22697 }, { "epoch": 2.8874189034473985, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.270225524902344, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8604158163070679, "num_tokens": 866062446.0, "step": 22698 }, { "epoch": 2.887546113725989, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.69919204711914, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8740355372428894, "num_tokens": 866100896.0, "step": 22699 }, { "epoch": 2.8876733240045795, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.34793472290039, "learning_rate": 1e-06, "loss": 0.6601, "mean_token_accuracy": 0.8500384092330933, "num_tokens": 866134668.0, "step": 22700 }, { "epoch": 2.88780053428317, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.62501525878906, "learning_rate": 1e-06, "loss": 0.6514, "mean_token_accuracy": 0.8562113046646118, "num_tokens": 866173834.0, "step": 22701 }, { "epoch": 2.8879277445617606, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.8709602355957, "learning_rate": 1e-06, "loss": 0.6973, "mean_token_accuracy": 0.8450883030891418, "num_tokens": 866211691.0, "step": 22702 }, { "epoch": 2.888054954840351, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.30467987060547, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.861893355846405, "num_tokens": 866240878.0, "step": 22703 }, { "epoch": 2.8881821651189417, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.1728401184082, "learning_rate": 1e-06, "loss": 0.6611, "mean_token_accuracy": 0.8492741584777832, "num_tokens": 866289552.0, "step": 22704 }, { "epoch": 2.888309375397532, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.15677261352539, "learning_rate": 1e-06, "loss": 0.6622, "mean_token_accuracy": 0.8534764051437378, "num_tokens": 866330361.0, "step": 22705 }, { "epoch": 2.8884365856761227, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.547000885009766, "learning_rate": 1e-06, "loss": 0.6187, "mean_token_accuracy": 0.8671972751617432, "num_tokens": 866371557.0, "step": 22706 }, { "epoch": 2.8885637959547132, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.97577667236328, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8641797304153442, "num_tokens": 866410627.0, "step": 22707 }, { "epoch": 2.8886910062333038, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.523681640625, "learning_rate": 1e-06, "loss": 0.6301, "mean_token_accuracy": 0.8616701364517212, "num_tokens": 866446036.0, "step": 22708 }, { "epoch": 2.8888182165118943, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.65311813354492, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8536860942840576, "num_tokens": 866489655.0, "step": 22709 }, { "epoch": 2.888945426790485, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.387001037597656, "learning_rate": 1e-06, "loss": 0.6575, "mean_token_accuracy": 0.8557828068733215, "num_tokens": 866532750.0, "step": 22710 }, { "epoch": 2.8890726370690754, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.50193786621094, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8689939975738525, "num_tokens": 866570748.0, "step": 22711 }, { "epoch": 2.8891998473476654, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.60371017456055, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8576141595840454, "num_tokens": 866609422.0, "step": 22712 }, { "epoch": 2.8893270576262564, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.455955505371094, "learning_rate": 1e-06, "loss": 0.6166, "mean_token_accuracy": 0.8641378283500671, "num_tokens": 866650865.0, "step": 22713 }, { "epoch": 2.8894542679048465, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.49024963378906, "learning_rate": 1e-06, "loss": 0.6342, "mean_token_accuracy": 0.8612937927246094, "num_tokens": 866687605.0, "step": 22714 }, { "epoch": 2.8895814781834375, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.232818603515625, "learning_rate": 1e-06, "loss": 0.6474, "mean_token_accuracy": 0.8525756001472473, "num_tokens": 866722974.0, "step": 22715 }, { "epoch": 2.8897086884620276, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.52596664428711, "learning_rate": 1e-06, "loss": 0.6347, "mean_token_accuracy": 0.8609673976898193, "num_tokens": 866760694.0, "step": 22716 }, { "epoch": 2.889835898740618, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.38701629638672, "learning_rate": 1e-06, "loss": 0.675, "mean_token_accuracy": 0.8435127139091492, "num_tokens": 866795102.0, "step": 22717 }, { "epoch": 2.8899631090192086, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.717506408691406, "learning_rate": 1e-06, "loss": 0.5779, "mean_token_accuracy": 0.8772038221359253, "num_tokens": 866831282.0, "step": 22718 }, { "epoch": 2.890090319297799, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.47492599487305, "learning_rate": 1e-06, "loss": 0.655, "mean_token_accuracy": 0.8537717461585999, "num_tokens": 866872661.0, "step": 22719 }, { "epoch": 2.8902175295763897, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.44831466674805, "learning_rate": 1e-06, "loss": 0.6476, "mean_token_accuracy": 0.8531239032745361, "num_tokens": 866917576.0, "step": 22720 }, { "epoch": 2.89034473985498, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.4334602355957, "learning_rate": 1e-06, "loss": 0.6248, "mean_token_accuracy": 0.8672511577606201, "num_tokens": 866957245.0, "step": 22721 }, { "epoch": 2.8904719501335707, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.9339714050293, "learning_rate": 1e-06, "loss": 0.7026, "mean_token_accuracy": 0.8346045017242432, "num_tokens": 866998695.0, "step": 22722 }, { "epoch": 2.8905991604121613, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.24705123901367, "learning_rate": 1e-06, "loss": 0.672, "mean_token_accuracy": 0.8489071726799011, "num_tokens": 867033502.0, "step": 22723 }, { "epoch": 2.890726370690752, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.06413650512695, "learning_rate": 1e-06, "loss": 0.674, "mean_token_accuracy": 0.8449615240097046, "num_tokens": 867076032.0, "step": 22724 }, { "epoch": 2.8908535809693423, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.21961212158203, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8625518679618835, "num_tokens": 867107429.0, "step": 22725 }, { "epoch": 2.890980791247933, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.88953399658203, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.865595817565918, "num_tokens": 867139432.0, "step": 22726 }, { "epoch": 2.8911080015265234, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.790225982666016, "learning_rate": 1e-06, "loss": 0.638, "mean_token_accuracy": 0.8591114282608032, "num_tokens": 867180588.0, "step": 22727 }, { "epoch": 2.891235211805114, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.733978271484375, "learning_rate": 1e-06, "loss": 0.6344, "mean_token_accuracy": 0.8590216636657715, "num_tokens": 867215170.0, "step": 22728 }, { "epoch": 2.8913624220837044, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.04155349731445, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8795194625854492, "num_tokens": 867254958.0, "step": 22729 }, { "epoch": 2.891489632362295, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.521610260009766, "learning_rate": 1e-06, "loss": 0.6345, "mean_token_accuracy": 0.8607699871063232, "num_tokens": 867291135.0, "step": 22730 }, { "epoch": 2.8916168426408855, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.539859771728516, "learning_rate": 1e-06, "loss": 0.6186, "mean_token_accuracy": 0.8694353103637695, "num_tokens": 867326926.0, "step": 22731 }, { "epoch": 2.891744052919476, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.78730392456055, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8535760641098022, "num_tokens": 867361142.0, "step": 22732 }, { "epoch": 2.8918712631980665, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.56930160522461, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8801894187927246, "num_tokens": 867398344.0, "step": 22733 }, { "epoch": 2.891998473476657, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.4199104309082, "learning_rate": 1e-06, "loss": 0.5991, "mean_token_accuracy": 0.871554970741272, "num_tokens": 867437642.0, "step": 22734 }, { "epoch": 2.8921256837552476, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.12940216064453, "learning_rate": 1e-06, "loss": 0.6732, "mean_token_accuracy": 0.8511440753936768, "num_tokens": 867471646.0, "step": 22735 }, { "epoch": 2.892252894033838, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 51.852230072021484, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8637314438819885, "num_tokens": 867509682.0, "step": 22736 }, { "epoch": 2.892380104312428, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 54.22884750366211, "learning_rate": 1e-06, "loss": 0.6627, "mean_token_accuracy": 0.8589074611663818, "num_tokens": 867544425.0, "step": 22737 }, { "epoch": 2.892507314591019, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.389305114746094, "learning_rate": 1e-06, "loss": 0.6423, "mean_token_accuracy": 0.8562815189361572, "num_tokens": 867584978.0, "step": 22738 }, { "epoch": 2.8926345248696093, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.66135025024414, "learning_rate": 1e-06, "loss": 0.6305, "mean_token_accuracy": 0.8641262054443359, "num_tokens": 867620360.0, "step": 22739 }, { "epoch": 2.8927617351482002, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.116783142089844, "learning_rate": 1e-06, "loss": 0.5924, "mean_token_accuracy": 0.8717806339263916, "num_tokens": 867655246.0, "step": 22740 }, { "epoch": 2.8928889454267903, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.66486358642578, "learning_rate": 1e-06, "loss": 0.6533, "mean_token_accuracy": 0.8531463146209717, "num_tokens": 867692792.0, "step": 22741 }, { "epoch": 2.893016155705381, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.44273376464844, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8653735518455505, "num_tokens": 867732365.0, "step": 22742 }, { "epoch": 2.8931433659839714, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.4930419921875, "learning_rate": 1e-06, "loss": 0.6774, "mean_token_accuracy": 0.8474915027618408, "num_tokens": 867764800.0, "step": 22743 }, { "epoch": 2.893270576262562, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.5696907043457, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.856047511100769, "num_tokens": 867807107.0, "step": 22744 }, { "epoch": 2.8933977865411524, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.08584976196289, "learning_rate": 1e-06, "loss": 0.6596, "mean_token_accuracy": 0.852363109588623, "num_tokens": 867850225.0, "step": 22745 }, { "epoch": 2.893524996819743, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.851768493652344, "learning_rate": 1e-06, "loss": 0.6405, "mean_token_accuracy": 0.8588443398475647, "num_tokens": 867890457.0, "step": 22746 }, { "epoch": 2.8936522070983335, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.918636322021484, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.8687768578529358, "num_tokens": 867925880.0, "step": 22747 }, { "epoch": 2.893779417376924, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.67695617675781, "learning_rate": 1e-06, "loss": 0.5769, "mean_token_accuracy": 0.8763048648834229, "num_tokens": 867963923.0, "step": 22748 }, { "epoch": 2.8939066276555145, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.9547233581543, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8660849332809448, "num_tokens": 868000489.0, "step": 22749 }, { "epoch": 2.894033837934105, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.80765151977539, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8656672239303589, "num_tokens": 868034001.0, "step": 22750 }, { "epoch": 2.8941610482126956, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.71610641479492, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8666314482688904, "num_tokens": 868074956.0, "step": 22751 }, { "epoch": 2.894288258491286, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.29648208618164, "learning_rate": 1e-06, "loss": 0.6009, "mean_token_accuracy": 0.8701795339584351, "num_tokens": 868110727.0, "step": 22752 }, { "epoch": 2.8944154687698767, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.032894134521484, "learning_rate": 1e-06, "loss": 0.6949, "mean_token_accuracy": 0.8420149087905884, "num_tokens": 868151480.0, "step": 22753 }, { "epoch": 2.894542679048467, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.536563873291016, "learning_rate": 1e-06, "loss": 0.652, "mean_token_accuracy": 0.8538286685943604, "num_tokens": 868190620.0, "step": 22754 }, { "epoch": 2.8946698893270577, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.71683883666992, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8616399168968201, "num_tokens": 868230598.0, "step": 22755 }, { "epoch": 2.8947970996056482, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.73672103881836, "learning_rate": 1e-06, "loss": 0.666, "mean_token_accuracy": 0.8554986715316772, "num_tokens": 868265933.0, "step": 22756 }, { "epoch": 2.8949243098842388, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.394615173339844, "learning_rate": 1e-06, "loss": 0.6033, "mean_token_accuracy": 0.8662413358688354, "num_tokens": 868300282.0, "step": 22757 }, { "epoch": 2.8950515201628293, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.27295684814453, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8775003552436829, "num_tokens": 868331031.0, "step": 22758 }, { "epoch": 2.89517873044142, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.72415542602539, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8642641305923462, "num_tokens": 868368268.0, "step": 22759 }, { "epoch": 2.89530594072001, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.17959213256836, "learning_rate": 1e-06, "loss": 0.6802, "mean_token_accuracy": 0.8455792665481567, "num_tokens": 868407551.0, "step": 22760 }, { "epoch": 2.895433150998601, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.87482452392578, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8600872159004211, "num_tokens": 868449301.0, "step": 22761 }, { "epoch": 2.895560361277191, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.081787109375, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8598491549491882, "num_tokens": 868482853.0, "step": 22762 }, { "epoch": 2.895687571555782, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.96687316894531, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8580612540245056, "num_tokens": 868523960.0, "step": 22763 }, { "epoch": 2.895814781834372, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.27450180053711, "learning_rate": 1e-06, "loss": 0.6581, "mean_token_accuracy": 0.8556643128395081, "num_tokens": 868566447.0, "step": 22764 }, { "epoch": 2.895941992112963, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.283485412597656, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8730473518371582, "num_tokens": 868608342.0, "step": 22765 }, { "epoch": 2.896069202391553, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.18745040893555, "learning_rate": 1e-06, "loss": 0.6029, "mean_token_accuracy": 0.8731920123100281, "num_tokens": 868647354.0, "step": 22766 }, { "epoch": 2.8961964126701436, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.179412841796875, "learning_rate": 1e-06, "loss": 0.6924, "mean_token_accuracy": 0.8446067571640015, "num_tokens": 868690716.0, "step": 22767 }, { "epoch": 2.896323622948734, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.64796829223633, "learning_rate": 1e-06, "loss": 0.6065, "mean_token_accuracy": 0.8696036338806152, "num_tokens": 868728318.0, "step": 22768 }, { "epoch": 2.8964508332273247, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.12850570678711, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8774381875991821, "num_tokens": 868764323.0, "step": 22769 }, { "epoch": 2.896578043505915, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.38103485107422, "learning_rate": 1e-06, "loss": 0.6938, "mean_token_accuracy": 0.8448028564453125, "num_tokens": 868801371.0, "step": 22770 }, { "epoch": 2.8967052537845057, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.6655158996582, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8685081005096436, "num_tokens": 868846679.0, "step": 22771 }, { "epoch": 2.8968324640630962, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.95079803466797, "learning_rate": 1e-06, "loss": 0.6773, "mean_token_accuracy": 0.8493154048919678, "num_tokens": 868880076.0, "step": 22772 }, { "epoch": 2.8969596743416868, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.68692398071289, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8628665208816528, "num_tokens": 868917086.0, "step": 22773 }, { "epoch": 2.8970868846202773, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.244388580322266, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.868293821811676, "num_tokens": 868954952.0, "step": 22774 }, { "epoch": 2.897214094898868, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.321876525878906, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8584015369415283, "num_tokens": 868990911.0, "step": 22775 }, { "epoch": 2.8973413051774584, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.56032180786133, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8736448884010315, "num_tokens": 869033398.0, "step": 22776 }, { "epoch": 2.897468515456049, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.69190216064453, "learning_rate": 1e-06, "loss": 0.6657, "mean_token_accuracy": 0.8504324555397034, "num_tokens": 869070513.0, "step": 22777 }, { "epoch": 2.8975957257346394, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.98914337158203, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8703938722610474, "num_tokens": 869113494.0, "step": 22778 }, { "epoch": 2.89772293601323, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.43132781982422, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8724427223205566, "num_tokens": 869147433.0, "step": 22779 }, { "epoch": 2.8978501462918205, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.98673629760742, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8588037490844727, "num_tokens": 869186763.0, "step": 22780 }, { "epoch": 2.897977356570411, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.49162292480469, "learning_rate": 1e-06, "loss": 0.5802, "mean_token_accuracy": 0.876317024230957, "num_tokens": 869222624.0, "step": 22781 }, { "epoch": 2.8981045668490015, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.74630355834961, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8849289417266846, "num_tokens": 869263027.0, "step": 22782 }, { "epoch": 2.898231777127592, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.04497146606445, "learning_rate": 1e-06, "loss": 0.6369, "mean_token_accuracy": 0.8579683303833008, "num_tokens": 869304232.0, "step": 22783 }, { "epoch": 2.8983589874061826, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.0479850769043, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8682399988174438, "num_tokens": 869340141.0, "step": 22784 }, { "epoch": 2.8984861976847727, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.01222229003906, "learning_rate": 1e-06, "loss": 0.6476, "mean_token_accuracy": 0.8535003662109375, "num_tokens": 869382428.0, "step": 22785 }, { "epoch": 2.8986134079633636, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.244468688964844, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8702858686447144, "num_tokens": 869418890.0, "step": 22786 }, { "epoch": 2.8987406182419537, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.9898567199707, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.862653374671936, "num_tokens": 869453704.0, "step": 22787 }, { "epoch": 2.8988678285205447, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.99811553955078, "learning_rate": 1e-06, "loss": 0.6538, "mean_token_accuracy": 0.8584244251251221, "num_tokens": 869491845.0, "step": 22788 }, { "epoch": 2.898995038799135, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.12824249267578, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8747479319572449, "num_tokens": 869523393.0, "step": 22789 }, { "epoch": 2.8991222490777258, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.09423828125, "learning_rate": 1e-06, "loss": 0.6374, "mean_token_accuracy": 0.8604990243911743, "num_tokens": 869561927.0, "step": 22790 }, { "epoch": 2.899249459356316, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.984989166259766, "learning_rate": 1e-06, "loss": 0.6242, "mean_token_accuracy": 0.8620405197143555, "num_tokens": 869599568.0, "step": 22791 }, { "epoch": 2.8993766696349064, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.92953872680664, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8664224743843079, "num_tokens": 869636329.0, "step": 22792 }, { "epoch": 2.899503879913497, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.94084930419922, "learning_rate": 1e-06, "loss": 0.6444, "mean_token_accuracy": 0.8577455878257751, "num_tokens": 869679720.0, "step": 22793 }, { "epoch": 2.8996310901920874, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.13539123535156, "learning_rate": 1e-06, "loss": 0.6417, "mean_token_accuracy": 0.8654502630233765, "num_tokens": 869719527.0, "step": 22794 }, { "epoch": 2.899758300470678, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.864295959472656, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8676738739013672, "num_tokens": 869755139.0, "step": 22795 }, { "epoch": 2.8998855107492685, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.33243942260742, "learning_rate": 1e-06, "loss": 0.6425, "mean_token_accuracy": 0.8577520251274109, "num_tokens": 869791860.0, "step": 22796 }, { "epoch": 2.900012721027859, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.934837341308594, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8840773105621338, "num_tokens": 869827294.0, "step": 22797 }, { "epoch": 2.9001399313064495, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.62807846069336, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8577764630317688, "num_tokens": 869859398.0, "step": 22798 }, { "epoch": 2.90026714158504, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.37523651123047, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8636122345924377, "num_tokens": 869898215.0, "step": 22799 }, { "epoch": 2.9003943518636306, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.61495590209961, "learning_rate": 1e-06, "loss": 0.6592, "mean_token_accuracy": 0.8533743619918823, "num_tokens": 869939787.0, "step": 22800 }, { "epoch": 2.900521562142221, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.4566535949707, "learning_rate": 1e-06, "loss": 0.6708, "mean_token_accuracy": 0.8527747988700867, "num_tokens": 869976790.0, "step": 22801 }, { "epoch": 2.9006487724208116, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.39611053466797, "learning_rate": 1e-06, "loss": 0.612, "mean_token_accuracy": 0.8677224516868591, "num_tokens": 870013547.0, "step": 22802 }, { "epoch": 2.900775982699402, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.385826110839844, "learning_rate": 1e-06, "loss": 0.5911, "mean_token_accuracy": 0.874902606010437, "num_tokens": 870049540.0, "step": 22803 }, { "epoch": 2.9009031929779927, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.195526123046875, "learning_rate": 1e-06, "loss": 0.6047, "mean_token_accuracy": 0.8715115189552307, "num_tokens": 870088247.0, "step": 22804 }, { "epoch": 2.9010304032565832, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.90262222290039, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8630169630050659, "num_tokens": 870129112.0, "step": 22805 }, { "epoch": 2.9011576135351738, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.26416015625, "learning_rate": 1e-06, "loss": 0.6248, "mean_token_accuracy": 0.8634573221206665, "num_tokens": 870162082.0, "step": 22806 }, { "epoch": 2.9012848238137643, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.64917755126953, "learning_rate": 1e-06, "loss": 0.5981, "mean_token_accuracy": 0.8731434345245361, "num_tokens": 870199868.0, "step": 22807 }, { "epoch": 2.901412034092355, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.8049201965332, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8696005940437317, "num_tokens": 870238121.0, "step": 22808 }, { "epoch": 2.9015392443709453, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.81582260131836, "learning_rate": 1e-06, "loss": 0.6545, "mean_token_accuracy": 0.8547600507736206, "num_tokens": 870278008.0, "step": 22809 }, { "epoch": 2.9016664546495354, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.114681243896484, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8591763973236084, "num_tokens": 870317671.0, "step": 22810 }, { "epoch": 2.9017936649281264, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.59739303588867, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8686061501502991, "num_tokens": 870352893.0, "step": 22811 }, { "epoch": 2.9019208752067165, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.794288635253906, "learning_rate": 1e-06, "loss": 0.6055, "mean_token_accuracy": 0.8703666925430298, "num_tokens": 870387513.0, "step": 22812 }, { "epoch": 2.9020480854853075, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.55191421508789, "learning_rate": 1e-06, "loss": 0.6078, "mean_token_accuracy": 0.8695008754730225, "num_tokens": 870428649.0, "step": 22813 }, { "epoch": 2.9021752957638975, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.849300384521484, "learning_rate": 1e-06, "loss": 0.5814, "mean_token_accuracy": 0.8776462078094482, "num_tokens": 870468329.0, "step": 22814 }, { "epoch": 2.902302506042488, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.850643157958984, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8638235330581665, "num_tokens": 870509273.0, "step": 22815 }, { "epoch": 2.9024297163210786, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.80779266357422, "learning_rate": 1e-06, "loss": 0.6354, "mean_token_accuracy": 0.8583638668060303, "num_tokens": 870544206.0, "step": 22816 }, { "epoch": 2.902556926599669, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.506813049316406, "learning_rate": 1e-06, "loss": 0.5828, "mean_token_accuracy": 0.8785127401351929, "num_tokens": 870578708.0, "step": 22817 }, { "epoch": 2.9026841368782597, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.92156982421875, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8701175451278687, "num_tokens": 870620599.0, "step": 22818 }, { "epoch": 2.90281134715685, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.566871643066406, "learning_rate": 1e-06, "loss": 0.5794, "mean_token_accuracy": 0.8768624067306519, "num_tokens": 870657828.0, "step": 22819 }, { "epoch": 2.9029385574354407, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.49650192260742, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.8588045239448547, "num_tokens": 870694786.0, "step": 22820 }, { "epoch": 2.9030657677140312, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.41448211669922, "learning_rate": 1e-06, "loss": 0.5825, "mean_token_accuracy": 0.8814793825149536, "num_tokens": 870733850.0, "step": 22821 }, { "epoch": 2.9031929779926218, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.29341506958008, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8652815818786621, "num_tokens": 870771556.0, "step": 22822 }, { "epoch": 2.9033201882712123, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.92007064819336, "learning_rate": 1e-06, "loss": 0.5906, "mean_token_accuracy": 0.8736743927001953, "num_tokens": 870809335.0, "step": 22823 }, { "epoch": 2.903447398549803, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.44765090942383, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8592766523361206, "num_tokens": 870844239.0, "step": 22824 }, { "epoch": 2.9035746088283934, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.583961486816406, "learning_rate": 1e-06, "loss": 0.6297, "mean_token_accuracy": 0.8638857007026672, "num_tokens": 870879540.0, "step": 22825 }, { "epoch": 2.903701819106984, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.34967803955078, "learning_rate": 1e-06, "loss": 0.638, "mean_token_accuracy": 0.8627805709838867, "num_tokens": 870917219.0, "step": 22826 }, { "epoch": 2.9038290293855744, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.94304656982422, "learning_rate": 1e-06, "loss": 0.6284, "mean_token_accuracy": 0.8633890151977539, "num_tokens": 870952332.0, "step": 22827 }, { "epoch": 2.903956239664165, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.036216735839844, "learning_rate": 1e-06, "loss": 0.6767, "mean_token_accuracy": 0.8517320156097412, "num_tokens": 870991609.0, "step": 22828 }, { "epoch": 2.9040834499427555, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.89414978027344, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8672475814819336, "num_tokens": 871027456.0, "step": 22829 }, { "epoch": 2.904210660221346, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.98813247680664, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8550617694854736, "num_tokens": 871071833.0, "step": 22830 }, { "epoch": 2.9043378704999365, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.735782623291016, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8621206283569336, "num_tokens": 871105446.0, "step": 22831 }, { "epoch": 2.904465080778527, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.925697326660156, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8613390922546387, "num_tokens": 871141408.0, "step": 22832 }, { "epoch": 2.9045922910571176, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.89244079589844, "learning_rate": 1e-06, "loss": 0.6235, "mean_token_accuracy": 0.8662667870521545, "num_tokens": 871173296.0, "step": 22833 }, { "epoch": 2.904719501335708, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.991432189941406, "learning_rate": 1e-06, "loss": 0.6495, "mean_token_accuracy": 0.8596319556236267, "num_tokens": 871210711.0, "step": 22834 }, { "epoch": 2.904846711614298, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.1591911315918, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.860188901424408, "num_tokens": 871251179.0, "step": 22835 }, { "epoch": 2.904973921892889, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.89485549926758, "learning_rate": 1e-06, "loss": 0.5661, "mean_token_accuracy": 0.883488655090332, "num_tokens": 871295758.0, "step": 22836 }, { "epoch": 2.9051011321714793, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.48418045043945, "learning_rate": 1e-06, "loss": 0.6819, "mean_token_accuracy": 0.845789909362793, "num_tokens": 871331145.0, "step": 22837 }, { "epoch": 2.9052283424500702, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.118736267089844, "learning_rate": 1e-06, "loss": 0.7047, "mean_token_accuracy": 0.8380411863327026, "num_tokens": 871372819.0, "step": 22838 }, { "epoch": 2.9053555527286603, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.17375564575195, "learning_rate": 1e-06, "loss": 0.6458, "mean_token_accuracy": 0.8582659959793091, "num_tokens": 871414223.0, "step": 22839 }, { "epoch": 2.905482763007251, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.95320510864258, "learning_rate": 1e-06, "loss": 0.6121, "mean_token_accuracy": 0.8678069114685059, "num_tokens": 871455013.0, "step": 22840 }, { "epoch": 2.9056099732858414, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.33803176879883, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8630866408348083, "num_tokens": 871495462.0, "step": 22841 }, { "epoch": 2.905737183564432, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.282798767089844, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8658013343811035, "num_tokens": 871534999.0, "step": 22842 }, { "epoch": 2.9058643938430224, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.31572723388672, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8641644716262817, "num_tokens": 871569816.0, "step": 22843 }, { "epoch": 2.905991604121613, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.232826232910156, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.866169810295105, "num_tokens": 871610441.0, "step": 22844 }, { "epoch": 2.9061188144002035, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.286067962646484, "learning_rate": 1e-06, "loss": 0.6711, "mean_token_accuracy": 0.848666787147522, "num_tokens": 871649005.0, "step": 22845 }, { "epoch": 2.906246024678794, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.94144821166992, "learning_rate": 1e-06, "loss": 0.6426, "mean_token_accuracy": 0.8596720695495605, "num_tokens": 871689263.0, "step": 22846 }, { "epoch": 2.9063732349573845, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.9022102355957, "learning_rate": 1e-06, "loss": 0.6004, "mean_token_accuracy": 0.8733813762664795, "num_tokens": 871725440.0, "step": 22847 }, { "epoch": 2.906500445235975, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.49406814575195, "learning_rate": 1e-06, "loss": 0.6451, "mean_token_accuracy": 0.8608649969100952, "num_tokens": 871767839.0, "step": 22848 }, { "epoch": 2.9066276555145656, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.38450622558594, "learning_rate": 1e-06, "loss": 0.6075, "mean_token_accuracy": 0.8688205480575562, "num_tokens": 871801516.0, "step": 22849 }, { "epoch": 2.906754865793156, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.092098236083984, "learning_rate": 1e-06, "loss": 0.6469, "mean_token_accuracy": 0.8606138229370117, "num_tokens": 871840359.0, "step": 22850 }, { "epoch": 2.9068820760717466, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.512149810791016, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8683435320854187, "num_tokens": 871879396.0, "step": 22851 }, { "epoch": 2.907009286350337, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.491790771484375, "learning_rate": 1e-06, "loss": 0.6572, "mean_token_accuracy": 0.8563185930252075, "num_tokens": 871915043.0, "step": 22852 }, { "epoch": 2.9071364966289277, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.53096008300781, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8578596711158752, "num_tokens": 871953391.0, "step": 22853 }, { "epoch": 2.9072637069075182, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.39977264404297, "learning_rate": 1e-06, "loss": 0.6149, "mean_token_accuracy": 0.8669540882110596, "num_tokens": 871989879.0, "step": 22854 }, { "epoch": 2.9073909171861088, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.7950553894043, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8658064603805542, "num_tokens": 872022428.0, "step": 22855 }, { "epoch": 2.9075181274646993, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.525291442871094, "learning_rate": 1e-06, "loss": 0.6128, "mean_token_accuracy": 0.8679415583610535, "num_tokens": 872060644.0, "step": 22856 }, { "epoch": 2.90764533774329, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.104312896728516, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8605270385742188, "num_tokens": 872098230.0, "step": 22857 }, { "epoch": 2.90777254802188, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.000789642333984, "learning_rate": 1e-06, "loss": 0.6555, "mean_token_accuracy": 0.8532910346984863, "num_tokens": 872131217.0, "step": 22858 }, { "epoch": 2.907899758300471, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.526031494140625, "learning_rate": 1e-06, "loss": 0.5996, "mean_token_accuracy": 0.8714298009872437, "num_tokens": 872163912.0, "step": 22859 }, { "epoch": 2.908026968579061, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.80561065673828, "learning_rate": 1e-06, "loss": 0.7087, "mean_token_accuracy": 0.839678943157196, "num_tokens": 872200694.0, "step": 22860 }, { "epoch": 2.908154178857652, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.571041107177734, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8633983135223389, "num_tokens": 872241156.0, "step": 22861 }, { "epoch": 2.908281389136242, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.7371940612793, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8644827604293823, "num_tokens": 872273048.0, "step": 22862 }, { "epoch": 2.908408599414833, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.5248908996582, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8677724003791809, "num_tokens": 872312165.0, "step": 22863 }, { "epoch": 2.908535809693423, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.306793212890625, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8666656017303467, "num_tokens": 872352740.0, "step": 22864 }, { "epoch": 2.9086630199720136, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.667381286621094, "learning_rate": 1e-06, "loss": 0.6599, "mean_token_accuracy": 0.8493527770042419, "num_tokens": 872397800.0, "step": 22865 }, { "epoch": 2.908790230250604, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.14463424682617, "learning_rate": 1e-06, "loss": 0.5965, "mean_token_accuracy": 0.8761411905288696, "num_tokens": 872429307.0, "step": 22866 }, { "epoch": 2.9089174405291947, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.52858352661133, "learning_rate": 1e-06, "loss": 0.6314, "mean_token_accuracy": 0.8607608079910278, "num_tokens": 872465897.0, "step": 22867 }, { "epoch": 2.909044650807785, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.26646423339844, "learning_rate": 1e-06, "loss": 0.6559, "mean_token_accuracy": 0.8542991876602173, "num_tokens": 872503252.0, "step": 22868 }, { "epoch": 2.9091718610863757, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.54374694824219, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8680049180984497, "num_tokens": 872546276.0, "step": 22869 }, { "epoch": 2.9092990713649662, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.20679473876953, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.8630449771881104, "num_tokens": 872585690.0, "step": 22870 }, { "epoch": 2.9094262816435568, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.812015533447266, "learning_rate": 1e-06, "loss": 0.6617, "mean_token_accuracy": 0.8519601225852966, "num_tokens": 872621718.0, "step": 22871 }, { "epoch": 2.9095534919221473, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.87042999267578, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.867415189743042, "num_tokens": 872659349.0, "step": 22872 }, { "epoch": 2.909680702200738, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.061588287353516, "learning_rate": 1e-06, "loss": 0.6377, "mean_token_accuracy": 0.860417366027832, "num_tokens": 872704468.0, "step": 22873 }, { "epoch": 2.9098079124793284, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.86014175415039, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.863923192024231, "num_tokens": 872740003.0, "step": 22874 }, { "epoch": 2.909935122757919, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.26225280761719, "learning_rate": 1e-06, "loss": 0.7037, "mean_token_accuracy": 0.8419003486633301, "num_tokens": 872780362.0, "step": 22875 }, { "epoch": 2.9100623330365094, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.60807800292969, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.8718326091766357, "num_tokens": 872823719.0, "step": 22876 }, { "epoch": 2.9101895433151, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.305694580078125, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8669990301132202, "num_tokens": 872856276.0, "step": 22877 }, { "epoch": 2.9103167535936905, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.27254104614258, "learning_rate": 1e-06, "loss": 0.6546, "mean_token_accuracy": 0.8560749292373657, "num_tokens": 872893688.0, "step": 22878 }, { "epoch": 2.910443963872281, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.63071823120117, "learning_rate": 1e-06, "loss": 0.5616, "mean_token_accuracy": 0.8826120495796204, "num_tokens": 872931169.0, "step": 22879 }, { "epoch": 2.9105711741508715, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.24734115600586, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8596258759498596, "num_tokens": 872970836.0, "step": 22880 }, { "epoch": 2.910698384429462, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.054840087890625, "learning_rate": 1e-06, "loss": 0.5808, "mean_token_accuracy": 0.8779308199882507, "num_tokens": 873010879.0, "step": 22881 }, { "epoch": 2.9108255947080526, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.008277893066406, "learning_rate": 1e-06, "loss": 0.6252, "mean_token_accuracy": 0.8649334907531738, "num_tokens": 873058018.0, "step": 22882 }, { "epoch": 2.9109528049866427, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.106449127197266, "learning_rate": 1e-06, "loss": 0.6672, "mean_token_accuracy": 0.8536136150360107, "num_tokens": 873096259.0, "step": 22883 }, { "epoch": 2.9110800152652336, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.25386428833008, "learning_rate": 1e-06, "loss": 0.6672, "mean_token_accuracy": 0.8531177639961243, "num_tokens": 873131955.0, "step": 22884 }, { "epoch": 2.9112072255438237, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.008697509765625, "learning_rate": 1e-06, "loss": 0.6547, "mean_token_accuracy": 0.8591181635856628, "num_tokens": 873171024.0, "step": 22885 }, { "epoch": 2.9113344358224147, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.46351623535156, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8678333759307861, "num_tokens": 873210568.0, "step": 22886 }, { "epoch": 2.9114616461010048, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.378257751464844, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.866399884223938, "num_tokens": 873250025.0, "step": 22887 }, { "epoch": 2.9115888563795957, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.48600769042969, "learning_rate": 1e-06, "loss": 0.6561, "mean_token_accuracy": 0.8556177616119385, "num_tokens": 873285284.0, "step": 22888 }, { "epoch": 2.911716066658186, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.144229888916016, "learning_rate": 1e-06, "loss": 0.621, "mean_token_accuracy": 0.8655939102172852, "num_tokens": 873320102.0, "step": 22889 }, { "epoch": 2.9118432769367764, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.881351470947266, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.8668361306190491, "num_tokens": 873354668.0, "step": 22890 }, { "epoch": 2.911970487215367, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.55061340332031, "learning_rate": 1e-06, "loss": 0.6398, "mean_token_accuracy": 0.8607985377311707, "num_tokens": 873387415.0, "step": 22891 }, { "epoch": 2.9120976974939574, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.60573196411133, "learning_rate": 1e-06, "loss": 0.5708, "mean_token_accuracy": 0.8816361427307129, "num_tokens": 873428679.0, "step": 22892 }, { "epoch": 2.912224907772548, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.79826354980469, "learning_rate": 1e-06, "loss": 0.6163, "mean_token_accuracy": 0.8645941615104675, "num_tokens": 873466973.0, "step": 22893 }, { "epoch": 2.9123521180511385, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.05432891845703, "learning_rate": 1e-06, "loss": 0.5754, "mean_token_accuracy": 0.8782357573509216, "num_tokens": 873510957.0, "step": 22894 }, { "epoch": 2.912479328329729, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.923614501953125, "learning_rate": 1e-06, "loss": 0.6966, "mean_token_accuracy": 0.8414990305900574, "num_tokens": 873546456.0, "step": 22895 }, { "epoch": 2.9126065386083195, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.146934509277344, "learning_rate": 1e-06, "loss": 0.6755, "mean_token_accuracy": 0.8484731912612915, "num_tokens": 873591972.0, "step": 22896 }, { "epoch": 2.91273374888691, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.12310791015625, "learning_rate": 1e-06, "loss": 0.6339, "mean_token_accuracy": 0.8607412576675415, "num_tokens": 873629790.0, "step": 22897 }, { "epoch": 2.9128609591655006, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.169166564941406, "learning_rate": 1e-06, "loss": 0.6337, "mean_token_accuracy": 0.8650155067443848, "num_tokens": 873667419.0, "step": 22898 }, { "epoch": 2.912988169444091, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.500389099121094, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.873539924621582, "num_tokens": 873705834.0, "step": 22899 }, { "epoch": 2.9131153797226816, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.075927734375, "learning_rate": 1e-06, "loss": 0.6556, "mean_token_accuracy": 0.8541272282600403, "num_tokens": 873748859.0, "step": 22900 }, { "epoch": 2.913242590001272, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.80768966674805, "learning_rate": 1e-06, "loss": 0.6604, "mean_token_accuracy": 0.8543534874916077, "num_tokens": 873788068.0, "step": 22901 }, { "epoch": 2.9133698002798627, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.61079406738281, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8720808029174805, "num_tokens": 873827820.0, "step": 22902 }, { "epoch": 2.9134970105584532, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.4703483581543, "learning_rate": 1e-06, "loss": 0.6446, "mean_token_accuracy": 0.8607606887817383, "num_tokens": 873866271.0, "step": 22903 }, { "epoch": 2.9136242208370438, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.83957290649414, "learning_rate": 1e-06, "loss": 0.6436, "mean_token_accuracy": 0.8582396507263184, "num_tokens": 873907409.0, "step": 22904 }, { "epoch": 2.9137514311156343, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.94060516357422, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.8706395626068115, "num_tokens": 873947940.0, "step": 22905 }, { "epoch": 2.913878641394225, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.03607177734375, "learning_rate": 1e-06, "loss": 0.6715, "mean_token_accuracy": 0.8488335609436035, "num_tokens": 873990062.0, "step": 22906 }, { "epoch": 2.9140058516728153, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.01228713989258, "learning_rate": 1e-06, "loss": 0.5894, "mean_token_accuracy": 0.8751991987228394, "num_tokens": 874026660.0, "step": 22907 }, { "epoch": 2.9141330619514054, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.17137145996094, "learning_rate": 1e-06, "loss": 0.6272, "mean_token_accuracy": 0.8702406883239746, "num_tokens": 874064935.0, "step": 22908 }, { "epoch": 2.9142602722299964, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.147953033447266, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8636883497238159, "num_tokens": 874102807.0, "step": 22909 }, { "epoch": 2.9143874825085865, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.89482116699219, "learning_rate": 1e-06, "loss": 0.6292, "mean_token_accuracy": 0.8654135465621948, "num_tokens": 874139688.0, "step": 22910 }, { "epoch": 2.9145146927871775, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.32053756713867, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8695176839828491, "num_tokens": 874184505.0, "step": 22911 }, { "epoch": 2.9146419030657675, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.937110900878906, "learning_rate": 1e-06, "loss": 0.6261, "mean_token_accuracy": 0.8635610938072205, "num_tokens": 874223210.0, "step": 22912 }, { "epoch": 2.914769113344358, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.53917694091797, "learning_rate": 1e-06, "loss": 0.6697, "mean_token_accuracy": 0.8500205278396606, "num_tokens": 874263939.0, "step": 22913 }, { "epoch": 2.9148963236229486, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.72249984741211, "learning_rate": 1e-06, "loss": 0.6108, "mean_token_accuracy": 0.8707967400550842, "num_tokens": 874303946.0, "step": 22914 }, { "epoch": 2.915023533901539, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.77629852294922, "learning_rate": 1e-06, "loss": 0.6126, "mean_token_accuracy": 0.8661176562309265, "num_tokens": 874343701.0, "step": 22915 }, { "epoch": 2.9151507441801296, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.85694885253906, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8658232688903809, "num_tokens": 874380425.0, "step": 22916 }, { "epoch": 2.91527795445872, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.433048248291016, "learning_rate": 1e-06, "loss": 0.6301, "mean_token_accuracy": 0.8641470670700073, "num_tokens": 874417643.0, "step": 22917 }, { "epoch": 2.9154051647373107, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.4749870300293, "learning_rate": 1e-06, "loss": 0.6627, "mean_token_accuracy": 0.8558210730552673, "num_tokens": 874463939.0, "step": 22918 }, { "epoch": 2.9155323750159012, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.375099182128906, "learning_rate": 1e-06, "loss": 0.6501, "mean_token_accuracy": 0.8584198355674744, "num_tokens": 874503156.0, "step": 22919 }, { "epoch": 2.9156595852944918, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.376243591308594, "learning_rate": 1e-06, "loss": 0.6649, "mean_token_accuracy": 0.8522322177886963, "num_tokens": 874546754.0, "step": 22920 }, { "epoch": 2.9157867955730823, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.166290283203125, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8663576245307922, "num_tokens": 874590781.0, "step": 22921 }, { "epoch": 2.915914005851673, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.46824264526367, "learning_rate": 1e-06, "loss": 0.5959, "mean_token_accuracy": 0.8745943307876587, "num_tokens": 874629592.0, "step": 22922 }, { "epoch": 2.9160412161302633, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.30925369262695, "learning_rate": 1e-06, "loss": 0.6007, "mean_token_accuracy": 0.8705843687057495, "num_tokens": 874667372.0, "step": 22923 }, { "epoch": 2.916168426408854, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.124210357666016, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8590928316116333, "num_tokens": 874704687.0, "step": 22924 }, { "epoch": 2.9162956366874444, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.17213821411133, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8623268604278564, "num_tokens": 874740471.0, "step": 22925 }, { "epoch": 2.916422846966035, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.35139083862305, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8688939213752747, "num_tokens": 874780747.0, "step": 22926 }, { "epoch": 2.9165500572446255, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.34479904174805, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.868087887763977, "num_tokens": 874814204.0, "step": 22927 }, { "epoch": 2.916677267523216, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.780662536621094, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8641393184661865, "num_tokens": 874851872.0, "step": 22928 }, { "epoch": 2.9168044778018065, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.08872985839844, "learning_rate": 1e-06, "loss": 0.6628, "mean_token_accuracy": 0.8492332100868225, "num_tokens": 874884215.0, "step": 22929 }, { "epoch": 2.916931688080397, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.216373443603516, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8610546588897705, "num_tokens": 874919047.0, "step": 22930 }, { "epoch": 2.917058898358987, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.99235916137695, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8569437265396118, "num_tokens": 874956469.0, "step": 22931 }, { "epoch": 2.917186108637578, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.72140121459961, "learning_rate": 1e-06, "loss": 0.588, "mean_token_accuracy": 0.8752848505973816, "num_tokens": 874994633.0, "step": 22932 }, { "epoch": 2.917313318916168, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.016902923583984, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.876185953617096, "num_tokens": 875028530.0, "step": 22933 }, { "epoch": 2.917440529194759, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.83230972290039, "learning_rate": 1e-06, "loss": 0.648, "mean_token_accuracy": 0.8562721014022827, "num_tokens": 875059785.0, "step": 22934 }, { "epoch": 2.9175677394733492, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.700199127197266, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8604755997657776, "num_tokens": 875096057.0, "step": 22935 }, { "epoch": 2.91769494975194, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.93379592895508, "learning_rate": 1e-06, "loss": 0.6031, "mean_token_accuracy": 0.8694594502449036, "num_tokens": 875141988.0, "step": 22936 }, { "epoch": 2.9178221600305303, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.17813491821289, "learning_rate": 1e-06, "loss": 0.6375, "mean_token_accuracy": 0.8687303066253662, "num_tokens": 875176973.0, "step": 22937 }, { "epoch": 2.917949370309121, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.41908264160156, "learning_rate": 1e-06, "loss": 0.6019, "mean_token_accuracy": 0.8716557025909424, "num_tokens": 875209271.0, "step": 22938 }, { "epoch": 2.9180765805877114, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.62084197998047, "learning_rate": 1e-06, "loss": 0.6132, "mean_token_accuracy": 0.8687306642532349, "num_tokens": 875247460.0, "step": 22939 }, { "epoch": 2.918203790866302, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.16132354736328, "learning_rate": 1e-06, "loss": 0.6254, "mean_token_accuracy": 0.8616248369216919, "num_tokens": 875286013.0, "step": 22940 }, { "epoch": 2.9183310011448924, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.907379150390625, "learning_rate": 1e-06, "loss": 0.6478, "mean_token_accuracy": 0.8591104745864868, "num_tokens": 875324547.0, "step": 22941 }, { "epoch": 2.918458211423483, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.901939392089844, "learning_rate": 1e-06, "loss": 0.641, "mean_token_accuracy": 0.8559858798980713, "num_tokens": 875366655.0, "step": 22942 }, { "epoch": 2.9185854217020735, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.680206298828125, "learning_rate": 1e-06, "loss": 0.6589, "mean_token_accuracy": 0.8590971231460571, "num_tokens": 875408571.0, "step": 22943 }, { "epoch": 2.918712631980664, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.953189849853516, "learning_rate": 1e-06, "loss": 0.6484, "mean_token_accuracy": 0.8575019836425781, "num_tokens": 875450730.0, "step": 22944 }, { "epoch": 2.9188398422592545, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.39877700805664, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8657742738723755, "num_tokens": 875486850.0, "step": 22945 }, { "epoch": 2.918967052537845, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.86509704589844, "learning_rate": 1e-06, "loss": 0.6603, "mean_token_accuracy": 0.8573900461196899, "num_tokens": 875528653.0, "step": 22946 }, { "epoch": 2.9190942628164356, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.98936080932617, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8741521835327148, "num_tokens": 875564155.0, "step": 22947 }, { "epoch": 2.919221473095026, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.791961669921875, "learning_rate": 1e-06, "loss": 0.6234, "mean_token_accuracy": 0.863239049911499, "num_tokens": 875598548.0, "step": 22948 }, { "epoch": 2.9193486833736166, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.67948913574219, "learning_rate": 1e-06, "loss": 0.5777, "mean_token_accuracy": 0.8796072006225586, "num_tokens": 875634587.0, "step": 22949 }, { "epoch": 2.919475893652207, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.991275787353516, "learning_rate": 1e-06, "loss": 0.632, "mean_token_accuracy": 0.8628678321838379, "num_tokens": 875671956.0, "step": 22950 }, { "epoch": 2.9196031039307977, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.957122802734375, "learning_rate": 1e-06, "loss": 0.6146, "mean_token_accuracy": 0.8656874895095825, "num_tokens": 875706327.0, "step": 22951 }, { "epoch": 2.9197303142093882, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.251312255859375, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8699349164962769, "num_tokens": 875749320.0, "step": 22952 }, { "epoch": 2.9198575244879788, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 54.170265197753906, "learning_rate": 1e-06, "loss": 0.6034, "mean_token_accuracy": 0.8732998371124268, "num_tokens": 875783871.0, "step": 22953 }, { "epoch": 2.9199847347665693, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.092872619628906, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8641322255134583, "num_tokens": 875821716.0, "step": 22954 }, { "epoch": 2.92011194504516, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.348297119140625, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.868065595626831, "num_tokens": 875854339.0, "step": 22955 }, { "epoch": 2.92023915532375, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.238136291503906, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8639911413192749, "num_tokens": 875889456.0, "step": 22956 }, { "epoch": 2.920366365602341, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.66334533691406, "learning_rate": 1e-06, "loss": 0.6591, "mean_token_accuracy": 0.855194091796875, "num_tokens": 875926659.0, "step": 22957 }, { "epoch": 2.920493575880931, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.0296516418457, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8732573986053467, "num_tokens": 875972766.0, "step": 22958 }, { "epoch": 2.920620786159522, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.37974548339844, "learning_rate": 1e-06, "loss": 0.6533, "mean_token_accuracy": 0.8596023321151733, "num_tokens": 876011824.0, "step": 22959 }, { "epoch": 2.920747996438112, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.189212799072266, "learning_rate": 1e-06, "loss": 0.697, "mean_token_accuracy": 0.8448057174682617, "num_tokens": 876050353.0, "step": 22960 }, { "epoch": 2.920875206716703, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.58294677734375, "learning_rate": 1e-06, "loss": 0.6737, "mean_token_accuracy": 0.8493918180465698, "num_tokens": 876089358.0, "step": 22961 }, { "epoch": 2.921002416995293, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.035972595214844, "learning_rate": 1e-06, "loss": 0.6731, "mean_token_accuracy": 0.8493350744247437, "num_tokens": 876126287.0, "step": 22962 }, { "epoch": 2.9211296272738836, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.519100189208984, "learning_rate": 1e-06, "loss": 0.6118, "mean_token_accuracy": 0.8746359348297119, "num_tokens": 876165157.0, "step": 22963 }, { "epoch": 2.921256837552474, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.161102294921875, "learning_rate": 1e-06, "loss": 0.6248, "mean_token_accuracy": 0.8636922240257263, "num_tokens": 876201391.0, "step": 22964 }, { "epoch": 2.9213840478310646, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.10961151123047, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8707288503646851, "num_tokens": 876243004.0, "step": 22965 }, { "epoch": 2.921511258109655, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.81165313720703, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8676528930664062, "num_tokens": 876279245.0, "step": 22966 }, { "epoch": 2.9216384683882457, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.81312561035156, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8619295358657837, "num_tokens": 876312145.0, "step": 22967 }, { "epoch": 2.9217656786668362, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.54652404785156, "learning_rate": 1e-06, "loss": 0.615, "mean_token_accuracy": 0.8659685850143433, "num_tokens": 876351354.0, "step": 22968 }, { "epoch": 2.9218928889454268, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.27907180786133, "learning_rate": 1e-06, "loss": 0.6751, "mean_token_accuracy": 0.852132260799408, "num_tokens": 876388621.0, "step": 22969 }, { "epoch": 2.9220200992240173, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.587608337402344, "learning_rate": 1e-06, "loss": 0.651, "mean_token_accuracy": 0.8565956950187683, "num_tokens": 876418377.0, "step": 22970 }, { "epoch": 2.922147309502608, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.05246353149414, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8669372797012329, "num_tokens": 876453351.0, "step": 22971 }, { "epoch": 2.9222745197811983, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.32371520996094, "learning_rate": 1e-06, "loss": 0.6042, "mean_token_accuracy": 0.8700190782546997, "num_tokens": 876484830.0, "step": 22972 }, { "epoch": 2.922401730059789, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.981788635253906, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8737848997116089, "num_tokens": 876524318.0, "step": 22973 }, { "epoch": 2.9225289403383794, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.037296295166016, "learning_rate": 1e-06, "loss": 0.6492, "mean_token_accuracy": 0.8611555099487305, "num_tokens": 876562421.0, "step": 22974 }, { "epoch": 2.92265615061697, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.490108489990234, "learning_rate": 1e-06, "loss": 0.6583, "mean_token_accuracy": 0.8573904633522034, "num_tokens": 876605681.0, "step": 22975 }, { "epoch": 2.9227833608955605, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.52033996582031, "learning_rate": 1e-06, "loss": 0.659, "mean_token_accuracy": 0.852241575717926, "num_tokens": 876647371.0, "step": 22976 }, { "epoch": 2.922910571174151, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.9161491394043, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.868513822555542, "num_tokens": 876685563.0, "step": 22977 }, { "epoch": 2.9230377814527415, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.64912414550781, "learning_rate": 1e-06, "loss": 0.6357, "mean_token_accuracy": 0.859714150428772, "num_tokens": 876718404.0, "step": 22978 }, { "epoch": 2.923164991731332, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.020660400390625, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.865848958492279, "num_tokens": 876751888.0, "step": 22979 }, { "epoch": 2.9232922020099226, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.55107116699219, "learning_rate": 1e-06, "loss": 0.6059, "mean_token_accuracy": 0.8681551218032837, "num_tokens": 876786813.0, "step": 22980 }, { "epoch": 2.9234194122885127, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.801273345947266, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8729256391525269, "num_tokens": 876824511.0, "step": 22981 }, { "epoch": 2.9235466225671036, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.83306884765625, "learning_rate": 1e-06, "loss": 0.6179, "mean_token_accuracy": 0.8656779527664185, "num_tokens": 876856409.0, "step": 22982 }, { "epoch": 2.9236738328456937, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.60105514526367, "learning_rate": 1e-06, "loss": 0.6738, "mean_token_accuracy": 0.8478889465332031, "num_tokens": 876889204.0, "step": 22983 }, { "epoch": 2.9238010431242847, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.04140853881836, "learning_rate": 1e-06, "loss": 0.6237, "mean_token_accuracy": 0.8651549816131592, "num_tokens": 876927185.0, "step": 22984 }, { "epoch": 2.9239282534028748, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.128108978271484, "learning_rate": 1e-06, "loss": 0.6407, "mean_token_accuracy": 0.8608629703521729, "num_tokens": 876965744.0, "step": 22985 }, { "epoch": 2.9240554636814657, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.694419860839844, "learning_rate": 1e-06, "loss": 0.6112, "mean_token_accuracy": 0.8701968193054199, "num_tokens": 876997107.0, "step": 22986 }, { "epoch": 2.924182673960056, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.98599624633789, "learning_rate": 1e-06, "loss": 0.6508, "mean_token_accuracy": 0.8526080846786499, "num_tokens": 877037894.0, "step": 22987 }, { "epoch": 2.9243098842386464, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.449806213378906, "learning_rate": 1e-06, "loss": 0.6852, "mean_token_accuracy": 0.8514485359191895, "num_tokens": 877078724.0, "step": 22988 }, { "epoch": 2.924437094517237, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.268348693847656, "learning_rate": 1e-06, "loss": 0.6251, "mean_token_accuracy": 0.8651316165924072, "num_tokens": 877119685.0, "step": 22989 }, { "epoch": 2.9245643047958274, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.26663589477539, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8665356636047363, "num_tokens": 877157055.0, "step": 22990 }, { "epoch": 2.924691515074418, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.11996078491211, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8681902885437012, "num_tokens": 877188569.0, "step": 22991 }, { "epoch": 2.9248187253530085, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.29252243041992, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8555408716201782, "num_tokens": 877222745.0, "step": 22992 }, { "epoch": 2.924945935631599, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.23099899291992, "learning_rate": 1e-06, "loss": 0.6796, "mean_token_accuracy": 0.8484135866165161, "num_tokens": 877262510.0, "step": 22993 }, { "epoch": 2.9250731459101895, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.09525680541992, "learning_rate": 1e-06, "loss": 0.6501, "mean_token_accuracy": 0.8557367920875549, "num_tokens": 877300164.0, "step": 22994 }, { "epoch": 2.92520035618878, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.19447708129883, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8738809823989868, "num_tokens": 877337187.0, "step": 22995 }, { "epoch": 2.9253275664673706, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.969757080078125, "learning_rate": 1e-06, "loss": 0.6867, "mean_token_accuracy": 0.8446295857429504, "num_tokens": 877375902.0, "step": 22996 }, { "epoch": 2.925454776745961, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.1574592590332, "learning_rate": 1e-06, "loss": 0.6325, "mean_token_accuracy": 0.8630141615867615, "num_tokens": 877414406.0, "step": 22997 }, { "epoch": 2.9255819870245516, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.85713577270508, "learning_rate": 1e-06, "loss": 0.5881, "mean_token_accuracy": 0.8698177337646484, "num_tokens": 877448977.0, "step": 22998 }, { "epoch": 2.925709197303142, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.297462463378906, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8704513311386108, "num_tokens": 877486766.0, "step": 22999 }, { "epoch": 2.9258364075817327, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.42823791503906, "learning_rate": 1e-06, "loss": 0.5791, "mean_token_accuracy": 0.8786360025405884, "num_tokens": 877519320.0, "step": 23000 }, { "epoch": 2.925963617860323, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.39314270019531, "learning_rate": 1e-06, "loss": 0.7092, "mean_token_accuracy": 0.838936984539032, "num_tokens": 877557913.0, "step": 23001 }, { "epoch": 2.9260908281389137, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.12842559814453, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8631306886672974, "num_tokens": 877591197.0, "step": 23002 }, { "epoch": 2.9262180384175043, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.71684265136719, "learning_rate": 1e-06, "loss": 0.5706, "mean_token_accuracy": 0.8803238868713379, "num_tokens": 877625018.0, "step": 23003 }, { "epoch": 2.926345248696095, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.529972076416016, "learning_rate": 1e-06, "loss": 0.6182, "mean_token_accuracy": 0.8681066036224365, "num_tokens": 877662433.0, "step": 23004 }, { "epoch": 2.9264724589746853, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.95917510986328, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.875736653804779, "num_tokens": 877703576.0, "step": 23005 }, { "epoch": 2.9265996692532754, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 54.0513801574707, "learning_rate": 1e-06, "loss": 0.6382, "mean_token_accuracy": 0.8576288223266602, "num_tokens": 877734025.0, "step": 23006 }, { "epoch": 2.9267268795318664, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.86738967895508, "learning_rate": 1e-06, "loss": 0.5831, "mean_token_accuracy": 0.8736646771430969, "num_tokens": 877771942.0, "step": 23007 }, { "epoch": 2.9268540898104565, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.97868728637695, "learning_rate": 1e-06, "loss": 0.626, "mean_token_accuracy": 0.8667645454406738, "num_tokens": 877815662.0, "step": 23008 }, { "epoch": 2.9269813000890474, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.91498565673828, "learning_rate": 1e-06, "loss": 0.6733, "mean_token_accuracy": 0.8479195237159729, "num_tokens": 877855398.0, "step": 23009 }, { "epoch": 2.9271085103676375, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.77035903930664, "learning_rate": 1e-06, "loss": 0.6513, "mean_token_accuracy": 0.8572279810905457, "num_tokens": 877898544.0, "step": 23010 }, { "epoch": 2.927235720646228, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.022865295410156, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.86265629529953, "num_tokens": 877937914.0, "step": 23011 }, { "epoch": 2.9273629309248186, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.701663970947266, "learning_rate": 1e-06, "loss": 0.688, "mean_token_accuracy": 0.8466030955314636, "num_tokens": 877979266.0, "step": 23012 }, { "epoch": 2.927490141203409, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.46256637573242, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8692022562026978, "num_tokens": 878019762.0, "step": 23013 }, { "epoch": 2.9276173514819996, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 54.19039535522461, "learning_rate": 1e-06, "loss": 0.6406, "mean_token_accuracy": 0.8575464487075806, "num_tokens": 878062174.0, "step": 23014 }, { "epoch": 2.92774456176059, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.851219177246094, "learning_rate": 1e-06, "loss": 0.6401, "mean_token_accuracy": 0.857429027557373, "num_tokens": 878098197.0, "step": 23015 }, { "epoch": 2.9278717720391807, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.95328140258789, "learning_rate": 1e-06, "loss": 0.6122, "mean_token_accuracy": 0.8687447309494019, "num_tokens": 878130478.0, "step": 23016 }, { "epoch": 2.9279989823177712, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.33321762084961, "learning_rate": 1e-06, "loss": 0.6732, "mean_token_accuracy": 0.8474404811859131, "num_tokens": 878175670.0, "step": 23017 }, { "epoch": 2.9281261925963618, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 54.00408935546875, "learning_rate": 1e-06, "loss": 0.6652, "mean_token_accuracy": 0.8519514799118042, "num_tokens": 878216415.0, "step": 23018 }, { "epoch": 2.9282534028749523, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.53501892089844, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8631263971328735, "num_tokens": 878249582.0, "step": 23019 }, { "epoch": 2.928380613153543, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.348236083984375, "learning_rate": 1e-06, "loss": 0.5852, "mean_token_accuracy": 0.8732926845550537, "num_tokens": 878290859.0, "step": 23020 }, { "epoch": 2.9285078234321333, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.9207649230957, "learning_rate": 1e-06, "loss": 0.5753, "mean_token_accuracy": 0.878587007522583, "num_tokens": 878322463.0, "step": 23021 }, { "epoch": 2.928635033710724, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.55919647216797, "learning_rate": 1e-06, "loss": 0.5867, "mean_token_accuracy": 0.8730859756469727, "num_tokens": 878365754.0, "step": 23022 }, { "epoch": 2.9287622439893144, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.48843002319336, "learning_rate": 1e-06, "loss": 0.5858, "mean_token_accuracy": 0.8767181038856506, "num_tokens": 878410325.0, "step": 23023 }, { "epoch": 2.928889454267905, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.5445671081543, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8698819875717163, "num_tokens": 878451604.0, "step": 23024 }, { "epoch": 2.9290166645464955, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 54.1358528137207, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8671388030052185, "num_tokens": 878485174.0, "step": 23025 }, { "epoch": 2.929143874825086, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.578269958496094, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.8560710549354553, "num_tokens": 878525867.0, "step": 23026 }, { "epoch": 2.9292710851036765, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.95551300048828, "learning_rate": 1e-06, "loss": 0.6656, "mean_token_accuracy": 0.8518159985542297, "num_tokens": 878561327.0, "step": 23027 }, { "epoch": 2.929398295382267, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.517822265625, "learning_rate": 1e-06, "loss": 0.6113, "mean_token_accuracy": 0.8689309358596802, "num_tokens": 878603045.0, "step": 23028 }, { "epoch": 2.929525505660857, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 54.89253234863281, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8697535991668701, "num_tokens": 878645051.0, "step": 23029 }, { "epoch": 2.929652715939448, "ewc_loss": 0.2158203125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019168853759765625, "grad_norm": 52.448890686035156, "learning_rate": 1e-06, "loss": 0.6793, "mean_token_accuracy": 0.846194863319397, "num_tokens": 878682003.0, "step": 23030 }, { "epoch": 2.929779926218038, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.61504364013672, "learning_rate": 1e-06, "loss": 0.6158, "mean_token_accuracy": 0.8676825761795044, "num_tokens": 878720509.0, "step": 23031 }, { "epoch": 2.929907136496629, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 53.37467575073242, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8703205585479736, "num_tokens": 878757950.0, "step": 23032 }, { "epoch": 2.9300343467752192, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.6000862121582, "learning_rate": 1e-06, "loss": 0.6561, "mean_token_accuracy": 0.8575727343559265, "num_tokens": 878801477.0, "step": 23033 }, { "epoch": 2.93016155705381, "ewc_loss": 0.21875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000194549560546875, "grad_norm": 52.792388916015625, "learning_rate": 1e-06, "loss": 0.647, "mean_token_accuracy": 0.8555898666381836, "num_tokens": 878838802.0, "step": 23034 }, { "epoch": 2.9302887673324003, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 54.08051681518555, "learning_rate": 1e-06, "loss": 0.649, "mean_token_accuracy": 0.8575870394706726, "num_tokens": 878875566.0, "step": 23035 }, { "epoch": 2.930415977610991, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.88566589355469, "learning_rate": 1e-06, "loss": 0.6537, "mean_token_accuracy": 0.8551186323165894, "num_tokens": 878913463.0, "step": 23036 }, { "epoch": 2.9305431878895813, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.90296173095703, "learning_rate": 1e-06, "loss": 0.6708, "mean_token_accuracy": 0.8505160212516785, "num_tokens": 878950769.0, "step": 23037 }, { "epoch": 2.930670398168172, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.93688201904297, "learning_rate": 1e-06, "loss": 0.6011, "mean_token_accuracy": 0.8732360601425171, "num_tokens": 878989041.0, "step": 23038 }, { "epoch": 2.9307976084467624, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.39806365966797, "learning_rate": 1e-06, "loss": 0.6094, "mean_token_accuracy": 0.8719263076782227, "num_tokens": 879020195.0, "step": 23039 }, { "epoch": 2.930924818725353, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.072715759277344, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8606865406036377, "num_tokens": 879057818.0, "step": 23040 }, { "epoch": 2.9310520290039435, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.447635650634766, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8591344356536865, "num_tokens": 879094747.0, "step": 23041 }, { "epoch": 2.931179239282534, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.38719940185547, "learning_rate": 1e-06, "loss": 0.6601, "mean_token_accuracy": 0.85523521900177, "num_tokens": 879131127.0, "step": 23042 }, { "epoch": 2.9313064495611245, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.16912078857422, "learning_rate": 1e-06, "loss": 0.6174, "mean_token_accuracy": 0.8627691864967346, "num_tokens": 879168541.0, "step": 23043 }, { "epoch": 2.931433659839715, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.59202575683594, "learning_rate": 1e-06, "loss": 0.6478, "mean_token_accuracy": 0.8588908314704895, "num_tokens": 879199409.0, "step": 23044 }, { "epoch": 2.9315608701183056, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.26008987426758, "learning_rate": 1e-06, "loss": 0.584, "mean_token_accuracy": 0.8771404027938843, "num_tokens": 879236447.0, "step": 23045 }, { "epoch": 2.931688080396896, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.21922302246094, "learning_rate": 1e-06, "loss": 0.6249, "mean_token_accuracy": 0.8637593984603882, "num_tokens": 879275596.0, "step": 23046 }, { "epoch": 2.9318152906754866, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.86874008178711, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8630414009094238, "num_tokens": 879315620.0, "step": 23047 }, { "epoch": 2.931942500954077, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.27386474609375, "learning_rate": 1e-06, "loss": 0.6188, "mean_token_accuracy": 0.8650302886962891, "num_tokens": 879356220.0, "step": 23048 }, { "epoch": 2.9320697112326677, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.06420135498047, "learning_rate": 1e-06, "loss": 0.6855, "mean_token_accuracy": 0.8468418121337891, "num_tokens": 879397544.0, "step": 23049 }, { "epoch": 2.932196921511258, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.887210845947266, "learning_rate": 1e-06, "loss": 0.603, "mean_token_accuracy": 0.8740140199661255, "num_tokens": 879438165.0, "step": 23050 }, { "epoch": 2.9323241317898487, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.670684814453125, "learning_rate": 1e-06, "loss": 0.6198, "mean_token_accuracy": 0.8651795387268066, "num_tokens": 879473601.0, "step": 23051 }, { "epoch": 2.9324513420684393, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.41212463378906, "learning_rate": 1e-06, "loss": 0.663, "mean_token_accuracy": 0.8570533394813538, "num_tokens": 879510998.0, "step": 23052 }, { "epoch": 2.93257855234703, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.37934112548828, "learning_rate": 1e-06, "loss": 0.6138, "mean_token_accuracy": 0.8679189085960388, "num_tokens": 879551469.0, "step": 23053 }, { "epoch": 2.93270576262562, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.20516586303711, "learning_rate": 1e-06, "loss": 0.6472, "mean_token_accuracy": 0.8550350069999695, "num_tokens": 879585620.0, "step": 23054 }, { "epoch": 2.932832972904211, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.60758590698242, "learning_rate": 1e-06, "loss": 0.5987, "mean_token_accuracy": 0.8723276853561401, "num_tokens": 879621749.0, "step": 23055 }, { "epoch": 2.932960183182801, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.621971130371094, "learning_rate": 1e-06, "loss": 0.6196, "mean_token_accuracy": 0.8656734228134155, "num_tokens": 879664529.0, "step": 23056 }, { "epoch": 2.933087393461392, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.53769302368164, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8700535297393799, "num_tokens": 879696293.0, "step": 23057 }, { "epoch": 2.933214603739982, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.35825729370117, "learning_rate": 1e-06, "loss": 0.6035, "mean_token_accuracy": 0.8683258295059204, "num_tokens": 879733562.0, "step": 23058 }, { "epoch": 2.933341814018573, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.55830001831055, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.8612526655197144, "num_tokens": 879771740.0, "step": 23059 }, { "epoch": 2.933469024297163, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.2966423034668, "learning_rate": 1e-06, "loss": 0.6433, "mean_token_accuracy": 0.8601245880126953, "num_tokens": 879805578.0, "step": 23060 }, { "epoch": 2.9335962345757536, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.07456970214844, "learning_rate": 1e-06, "loss": 0.6374, "mean_token_accuracy": 0.8610886335372925, "num_tokens": 879845175.0, "step": 23061 }, { "epoch": 2.933723444854344, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.35652542114258, "learning_rate": 1e-06, "loss": 0.583, "mean_token_accuracy": 0.8758487701416016, "num_tokens": 879878975.0, "step": 23062 }, { "epoch": 2.9338506551329346, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.05282974243164, "learning_rate": 1e-06, "loss": 0.6573, "mean_token_accuracy": 0.8560115098953247, "num_tokens": 879919373.0, "step": 23063 }, { "epoch": 2.933977865411525, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.54212188720703, "learning_rate": 1e-06, "loss": 0.6822, "mean_token_accuracy": 0.8437274694442749, "num_tokens": 879953821.0, "step": 23064 }, { "epoch": 2.9341050756901157, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.95438003540039, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8540926575660706, "num_tokens": 879993505.0, "step": 23065 }, { "epoch": 2.934232285968706, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.25949478149414, "learning_rate": 1e-06, "loss": 0.6101, "mean_token_accuracy": 0.8672723174095154, "num_tokens": 880030124.0, "step": 23066 }, { "epoch": 2.9343594962472968, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.299678802490234, "learning_rate": 1e-06, "loss": 0.7013, "mean_token_accuracy": 0.8391974568367004, "num_tokens": 880059480.0, "step": 23067 }, { "epoch": 2.9344867065258873, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.615657806396484, "learning_rate": 1e-06, "loss": 0.6192, "mean_token_accuracy": 0.8638908863067627, "num_tokens": 880093402.0, "step": 23068 }, { "epoch": 2.934613916804478, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.08201217651367, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8603552579879761, "num_tokens": 880127874.0, "step": 23069 }, { "epoch": 2.9347411270830683, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.79570388793945, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.873650074005127, "num_tokens": 880165755.0, "step": 23070 }, { "epoch": 2.934868337361659, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.27489471435547, "learning_rate": 1e-06, "loss": 0.6492, "mean_token_accuracy": 0.8571597933769226, "num_tokens": 880203931.0, "step": 23071 }, { "epoch": 2.9349955476402494, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.816810607910156, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8668820858001709, "num_tokens": 880242844.0, "step": 23072 }, { "epoch": 2.93512275791884, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.17898178100586, "learning_rate": 1e-06, "loss": 0.6696, "mean_token_accuracy": 0.8479551076889038, "num_tokens": 880280068.0, "step": 23073 }, { "epoch": 2.9352499681974304, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.394935607910156, "learning_rate": 1e-06, "loss": 0.6129, "mean_token_accuracy": 0.8707735538482666, "num_tokens": 880314044.0, "step": 23074 }, { "epoch": 2.935377178476021, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.335628509521484, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8616787791252136, "num_tokens": 880355077.0, "step": 23075 }, { "epoch": 2.9355043887546115, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.78277587890625, "learning_rate": 1e-06, "loss": 0.5788, "mean_token_accuracy": 0.8776450753211975, "num_tokens": 880388906.0, "step": 23076 }, { "epoch": 2.935631599033202, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.41445541381836, "learning_rate": 1e-06, "loss": 0.5932, "mean_token_accuracy": 0.875350832939148, "num_tokens": 880426834.0, "step": 23077 }, { "epoch": 2.9357588093117926, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.751346588134766, "learning_rate": 1e-06, "loss": 0.6286, "mean_token_accuracy": 0.8621174097061157, "num_tokens": 880463477.0, "step": 23078 }, { "epoch": 2.9358860195903826, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.2527961730957, "learning_rate": 1e-06, "loss": 0.6736, "mean_token_accuracy": 0.8464459180831909, "num_tokens": 880500884.0, "step": 23079 }, { "epoch": 2.9360132298689736, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.299198150634766, "learning_rate": 1e-06, "loss": 0.6387, "mean_token_accuracy": 0.858479380607605, "num_tokens": 880539724.0, "step": 23080 }, { "epoch": 2.9361404401475637, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.06958770751953, "learning_rate": 1e-06, "loss": 0.6389, "mean_token_accuracy": 0.8570486307144165, "num_tokens": 880577820.0, "step": 23081 }, { "epoch": 2.9362676504261547, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.835514068603516, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8678484559059143, "num_tokens": 880612055.0, "step": 23082 }, { "epoch": 2.9363948607047448, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.134361267089844, "learning_rate": 1e-06, "loss": 0.5986, "mean_token_accuracy": 0.8716015219688416, "num_tokens": 880654907.0, "step": 23083 }, { "epoch": 2.9365220709833357, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.71113586425781, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8787009716033936, "num_tokens": 880692418.0, "step": 23084 }, { "epoch": 2.936649281261926, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 52.72346496582031, "learning_rate": 1e-06, "loss": 0.6631, "mean_token_accuracy": 0.8483662009239197, "num_tokens": 880727742.0, "step": 23085 }, { "epoch": 2.9367764915405163, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.908348083496094, "learning_rate": 1e-06, "loss": 0.597, "mean_token_accuracy": 0.8760538101196289, "num_tokens": 880766673.0, "step": 23086 }, { "epoch": 2.936903701819107, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.798336029052734, "learning_rate": 1e-06, "loss": 0.6491, "mean_token_accuracy": 0.8548505902290344, "num_tokens": 880810390.0, "step": 23087 }, { "epoch": 2.9370309120976974, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.98181915283203, "learning_rate": 1e-06, "loss": 0.6452, "mean_token_accuracy": 0.8582967519760132, "num_tokens": 880846920.0, "step": 23088 }, { "epoch": 2.937158122376288, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.50082015991211, "learning_rate": 1e-06, "loss": 0.5951, "mean_token_accuracy": 0.8699295520782471, "num_tokens": 880885867.0, "step": 23089 }, { "epoch": 2.9372853326548785, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.95309066772461, "learning_rate": 1e-06, "loss": 0.6115, "mean_token_accuracy": 0.8745071887969971, "num_tokens": 880928408.0, "step": 23090 }, { "epoch": 2.937412542933469, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.51395797729492, "learning_rate": 1e-06, "loss": 0.6233, "mean_token_accuracy": 0.8630788326263428, "num_tokens": 880960681.0, "step": 23091 }, { "epoch": 2.9375397532120595, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.881771087646484, "learning_rate": 1e-06, "loss": 0.6619, "mean_token_accuracy": 0.8532775640487671, "num_tokens": 881000358.0, "step": 23092 }, { "epoch": 2.93766696349065, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 52.507789611816406, "learning_rate": 1e-06, "loss": 0.6203, "mean_token_accuracy": 0.8653327822685242, "num_tokens": 881037765.0, "step": 23093 }, { "epoch": 2.9377941737692406, "ewc_loss": 0.236328125, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 65.33464813232422, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8769652843475342, "num_tokens": 881076091.0, "step": 23094 }, { "epoch": 2.937921384047831, "ewc_loss": 0.2138671875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00018978118896484375, "grad_norm": 51.70446014404297, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8622272610664368, "num_tokens": 881112784.0, "step": 23095 }, { "epoch": 2.9380485943264216, "ewc_loss": 0.2578125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002346038818359375, "grad_norm": 57.50887680053711, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8698381185531616, "num_tokens": 881149938.0, "step": 23096 }, { "epoch": 2.938175804605012, "ewc_loss": 0.2119140625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00018787384033203125, "grad_norm": 51.648345947265625, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.8544765710830688, "num_tokens": 881187541.0, "step": 23097 }, { "epoch": 2.9383030148836027, "ewc_loss": 0.2578125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00023365020751953125, "grad_norm": 57.231815338134766, "learning_rate": 1e-06, "loss": 0.649, "mean_token_accuracy": 0.8652169704437256, "num_tokens": 881221920.0, "step": 23098 }, { "epoch": 2.938430225162193, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.635215759277344, "learning_rate": 1e-06, "loss": 0.665, "mean_token_accuracy": 0.8489673137664795, "num_tokens": 881264804.0, "step": 23099 }, { "epoch": 2.9385574354407837, "ewc_loss": 0.2451171875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00022029876708984375, "grad_norm": 56.22740936279297, "learning_rate": 1e-06, "loss": 0.6639, "mean_token_accuracy": 0.8565092086791992, "num_tokens": 881304482.0, "step": 23100 }, { "epoch": 2.9386846457193743, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.594390869140625, "learning_rate": 1e-06, "loss": 0.6587, "mean_token_accuracy": 0.8541382551193237, "num_tokens": 881340822.0, "step": 23101 }, { "epoch": 2.938811855997965, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 54.94424057006836, "learning_rate": 1e-06, "loss": 0.6544, "mean_token_accuracy": 0.860913872718811, "num_tokens": 881374352.0, "step": 23102 }, { "epoch": 2.9389390662765553, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.654205322265625, "learning_rate": 1e-06, "loss": 0.6438, "mean_token_accuracy": 0.857351541519165, "num_tokens": 881413176.0, "step": 23103 }, { "epoch": 2.9390662765551454, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.10943603515625, "learning_rate": 1e-06, "loss": 0.6547, "mean_token_accuracy": 0.8574870824813843, "num_tokens": 881454309.0, "step": 23104 }, { "epoch": 2.9391934868337364, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.043601989746094, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8643666505813599, "num_tokens": 881490702.0, "step": 23105 }, { "epoch": 2.9393206971123265, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.72380447387695, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8705772757530212, "num_tokens": 881530394.0, "step": 23106 }, { "epoch": 2.9394479073909174, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.052589416503906, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8755857348442078, "num_tokens": 881567286.0, "step": 23107 }, { "epoch": 2.9395751176695075, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.341243743896484, "learning_rate": 1e-06, "loss": 0.6664, "mean_token_accuracy": 0.8581812381744385, "num_tokens": 881607017.0, "step": 23108 }, { "epoch": 2.939702327948098, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.780548095703125, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8629709482192993, "num_tokens": 881645793.0, "step": 23109 }, { "epoch": 2.9398295382266886, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.13504409790039, "learning_rate": 1e-06, "loss": 0.629, "mean_token_accuracy": 0.8660940527915955, "num_tokens": 881683596.0, "step": 23110 }, { "epoch": 2.939956748505279, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.55753707885742, "learning_rate": 1e-06, "loss": 0.6839, "mean_token_accuracy": 0.8466994166374207, "num_tokens": 881717607.0, "step": 23111 }, { "epoch": 2.9400839587838696, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.39542007446289, "learning_rate": 1e-06, "loss": 0.5875, "mean_token_accuracy": 0.8765469789505005, "num_tokens": 881750884.0, "step": 23112 }, { "epoch": 2.94021116906246, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.332340240478516, "learning_rate": 1e-06, "loss": 0.6519, "mean_token_accuracy": 0.858778178691864, "num_tokens": 881789345.0, "step": 23113 }, { "epoch": 2.9403383793410507, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.62610626220703, "learning_rate": 1e-06, "loss": 0.6467, "mean_token_accuracy": 0.8568647503852844, "num_tokens": 881828459.0, "step": 23114 }, { "epoch": 2.940465589619641, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.45152282714844, "learning_rate": 1e-06, "loss": 0.6552, "mean_token_accuracy": 0.8573831915855408, "num_tokens": 881865294.0, "step": 23115 }, { "epoch": 2.9405927998982317, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.468631744384766, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8658599853515625, "num_tokens": 881904936.0, "step": 23116 }, { "epoch": 2.9407200101768223, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.44841766357422, "learning_rate": 1e-06, "loss": 0.5958, "mean_token_accuracy": 0.8739527463912964, "num_tokens": 881936300.0, "step": 23117 }, { "epoch": 2.940847220455413, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.62003707885742, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8740137815475464, "num_tokens": 881974092.0, "step": 23118 }, { "epoch": 2.9409744307340033, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.132049560546875, "learning_rate": 1e-06, "loss": 0.6322, "mean_token_accuracy": 0.8635156154632568, "num_tokens": 882019339.0, "step": 23119 }, { "epoch": 2.941101641012594, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.42918014526367, "learning_rate": 1e-06, "loss": 0.6223, "mean_token_accuracy": 0.8658466935157776, "num_tokens": 882052837.0, "step": 23120 }, { "epoch": 2.9412288512911844, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.89022445678711, "learning_rate": 1e-06, "loss": 0.6061, "mean_token_accuracy": 0.8670250773429871, "num_tokens": 882088449.0, "step": 23121 }, { "epoch": 2.941356061569775, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.41394805908203, "learning_rate": 1e-06, "loss": 0.6523, "mean_token_accuracy": 0.8545286059379578, "num_tokens": 882124844.0, "step": 23122 }, { "epoch": 2.9414832718483654, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.274837493896484, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.8661110401153564, "num_tokens": 882162246.0, "step": 23123 }, { "epoch": 2.941610482126956, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.775230407714844, "learning_rate": 1e-06, "loss": 0.7104, "mean_token_accuracy": 0.8529281616210938, "num_tokens": 882202825.0, "step": 23124 }, { "epoch": 2.9417376924055465, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.81538772583008, "learning_rate": 1e-06, "loss": 0.6673, "mean_token_accuracy": 0.8499876856803894, "num_tokens": 882237951.0, "step": 23125 }, { "epoch": 2.941864902684137, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.581268310546875, "learning_rate": 1e-06, "loss": 0.6397, "mean_token_accuracy": 0.8606959581375122, "num_tokens": 882279147.0, "step": 23126 }, { "epoch": 2.941992112962727, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.613643646240234, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8857553005218506, "num_tokens": 882322777.0, "step": 23127 }, { "epoch": 2.942119323241318, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.98020553588867, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8668287992477417, "num_tokens": 882360681.0, "step": 23128 }, { "epoch": 2.942246533519908, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.63325881958008, "learning_rate": 1e-06, "loss": 0.6875, "mean_token_accuracy": 0.8427838087081909, "num_tokens": 882403823.0, "step": 23129 }, { "epoch": 2.942373743798499, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.746097564697266, "learning_rate": 1e-06, "loss": 0.6159, "mean_token_accuracy": 0.8716286420822144, "num_tokens": 882443723.0, "step": 23130 }, { "epoch": 2.9425009540770892, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.996673583984375, "learning_rate": 1e-06, "loss": 0.6828, "mean_token_accuracy": 0.8485848307609558, "num_tokens": 882480610.0, "step": 23131 }, { "epoch": 2.94262816435568, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.92938995361328, "learning_rate": 1e-06, "loss": 0.6281, "mean_token_accuracy": 0.8642067909240723, "num_tokens": 882518789.0, "step": 23132 }, { "epoch": 2.9427553746342703, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.01875686645508, "learning_rate": 1e-06, "loss": 0.6735, "mean_token_accuracy": 0.849693775177002, "num_tokens": 882556598.0, "step": 23133 }, { "epoch": 2.942882584912861, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.1881217956543, "learning_rate": 1e-06, "loss": 0.653, "mean_token_accuracy": 0.8575359582901001, "num_tokens": 882593788.0, "step": 23134 }, { "epoch": 2.9430097951914513, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.96412658691406, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8700705766677856, "num_tokens": 882630861.0, "step": 23135 }, { "epoch": 2.943137005470042, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.13505172729492, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8708945512771606, "num_tokens": 882670950.0, "step": 23136 }, { "epoch": 2.9432642157486324, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.21553039550781, "learning_rate": 1e-06, "loss": 0.6653, "mean_token_accuracy": 0.8517662286758423, "num_tokens": 882705680.0, "step": 23137 }, { "epoch": 2.943391426027223, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.535926818847656, "learning_rate": 1e-06, "loss": 0.6069, "mean_token_accuracy": 0.872544527053833, "num_tokens": 882737589.0, "step": 23138 }, { "epoch": 2.9435186363058135, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.5234375, "learning_rate": 1e-06, "loss": 0.68, "mean_token_accuracy": 0.8477274179458618, "num_tokens": 882774502.0, "step": 23139 }, { "epoch": 2.943645846584404, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.379878997802734, "learning_rate": 1e-06, "loss": 0.659, "mean_token_accuracy": 0.8534519672393799, "num_tokens": 882815184.0, "step": 23140 }, { "epoch": 2.9437730568629945, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.61288070678711, "learning_rate": 1e-06, "loss": 0.5941, "mean_token_accuracy": 0.8743236660957336, "num_tokens": 882856793.0, "step": 23141 }, { "epoch": 2.943900267141585, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.15980529785156, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8737394213676453, "num_tokens": 882896128.0, "step": 23142 }, { "epoch": 2.9440274774201756, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.933067321777344, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8709911704063416, "num_tokens": 882931253.0, "step": 23143 }, { "epoch": 2.944154687698766, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.87660217285156, "learning_rate": 1e-06, "loss": 0.5722, "mean_token_accuracy": 0.8792254328727722, "num_tokens": 882975072.0, "step": 23144 }, { "epoch": 2.9442818979773566, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.358402252197266, "learning_rate": 1e-06, "loss": 0.5979, "mean_token_accuracy": 0.874687135219574, "num_tokens": 883013576.0, "step": 23145 }, { "epoch": 2.944409108255947, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.96508026123047, "learning_rate": 1e-06, "loss": 0.6114, "mean_token_accuracy": 0.8650988340377808, "num_tokens": 883046052.0, "step": 23146 }, { "epoch": 2.9445363185345377, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.15595626831055, "learning_rate": 1e-06, "loss": 0.7022, "mean_token_accuracy": 0.8416388630867004, "num_tokens": 883080398.0, "step": 23147 }, { "epoch": 2.944663528813128, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 52.71399688720703, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.8539700508117676, "num_tokens": 883120321.0, "step": 23148 }, { "epoch": 2.9447907390917187, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.89626693725586, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.869692325592041, "num_tokens": 883161285.0, "step": 23149 }, { "epoch": 2.9449179493703093, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.82968521118164, "learning_rate": 1e-06, "loss": 0.625, "mean_token_accuracy": 0.8632385730743408, "num_tokens": 883199857.0, "step": 23150 }, { "epoch": 2.9450451596489, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.84873580932617, "learning_rate": 1e-06, "loss": 0.6147, "mean_token_accuracy": 0.8690333366394043, "num_tokens": 883238325.0, "step": 23151 }, { "epoch": 2.94517236992749, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.370643615722656, "learning_rate": 1e-06, "loss": 0.6534, "mean_token_accuracy": 0.8536337018013, "num_tokens": 883281814.0, "step": 23152 }, { "epoch": 2.945299580206081, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.754486083984375, "learning_rate": 1e-06, "loss": 0.6119, "mean_token_accuracy": 0.8672884702682495, "num_tokens": 883317385.0, "step": 23153 }, { "epoch": 2.945426790484671, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.17924880981445, "learning_rate": 1e-06, "loss": 0.6021, "mean_token_accuracy": 0.8716070652008057, "num_tokens": 883353498.0, "step": 23154 }, { "epoch": 2.945554000763262, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.47684860229492, "learning_rate": 1e-06, "loss": 0.614, "mean_token_accuracy": 0.8694267272949219, "num_tokens": 883391709.0, "step": 23155 }, { "epoch": 2.945681211041852, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.99817657470703, "learning_rate": 1e-06, "loss": 0.6788, "mean_token_accuracy": 0.8483643531799316, "num_tokens": 883426329.0, "step": 23156 }, { "epoch": 2.945808421320443, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.41010284423828, "learning_rate": 1e-06, "loss": 0.6399, "mean_token_accuracy": 0.8615750670433044, "num_tokens": 883465797.0, "step": 23157 }, { "epoch": 2.945935631599033, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.34513473510742, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8650084137916565, "num_tokens": 883501632.0, "step": 23158 }, { "epoch": 2.9460628418776236, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.88555145263672, "learning_rate": 1e-06, "loss": 0.599, "mean_token_accuracy": 0.8742550611495972, "num_tokens": 883537622.0, "step": 23159 }, { "epoch": 2.946190052156214, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.788352966308594, "learning_rate": 1e-06, "loss": 0.6262, "mean_token_accuracy": 0.8640696406364441, "num_tokens": 883576458.0, "step": 23160 }, { "epoch": 2.9463172624348046, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.59004592895508, "learning_rate": 1e-06, "loss": 0.6482, "mean_token_accuracy": 0.8529391288757324, "num_tokens": 883620929.0, "step": 23161 }, { "epoch": 2.946444472713395, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.49515914916992, "learning_rate": 1e-06, "loss": 0.7165, "mean_token_accuracy": 0.840289831161499, "num_tokens": 883652589.0, "step": 23162 }, { "epoch": 2.9465716829919857, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.113525390625, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8638488054275513, "num_tokens": 883691844.0, "step": 23163 }, { "epoch": 2.946698893270576, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 54.03373336791992, "learning_rate": 1e-06, "loss": 0.6979, "mean_token_accuracy": 0.8432651162147522, "num_tokens": 883732838.0, "step": 23164 }, { "epoch": 2.9468261035491667, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 52.67433547973633, "learning_rate": 1e-06, "loss": 0.6231, "mean_token_accuracy": 0.8614292740821838, "num_tokens": 883771415.0, "step": 23165 }, { "epoch": 2.9469533138277573, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.78993225097656, "learning_rate": 1e-06, "loss": 0.6853, "mean_token_accuracy": 0.8479295372962952, "num_tokens": 883808807.0, "step": 23166 }, { "epoch": 2.947080524106348, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.1944580078125, "learning_rate": 1e-06, "loss": 0.5709, "mean_token_accuracy": 0.8775941729545593, "num_tokens": 883844300.0, "step": 23167 }, { "epoch": 2.9472077343849383, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 54.152183532714844, "learning_rate": 1e-06, "loss": 0.5922, "mean_token_accuracy": 0.8778674602508545, "num_tokens": 883879337.0, "step": 23168 }, { "epoch": 2.947334944663529, "ewc_loss": 0.2197265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019550323486328125, "grad_norm": 53.20689392089844, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.85268235206604, "num_tokens": 883925810.0, "step": 23169 }, { "epoch": 2.9474621549421194, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.9136848449707, "learning_rate": 1e-06, "loss": 0.6145, "mean_token_accuracy": 0.8722504377365112, "num_tokens": 883964753.0, "step": 23170 }, { "epoch": 2.94758936522071, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.517879486083984, "learning_rate": 1e-06, "loss": 0.6605, "mean_token_accuracy": 0.8547895550727844, "num_tokens": 884005737.0, "step": 23171 }, { "epoch": 2.9477165754993004, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.72486877441406, "learning_rate": 1e-06, "loss": 0.6509, "mean_token_accuracy": 0.8638927936553955, "num_tokens": 884043414.0, "step": 23172 }, { "epoch": 2.947843785777891, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.54948425292969, "learning_rate": 1e-06, "loss": 0.6176, "mean_token_accuracy": 0.8684780597686768, "num_tokens": 884077399.0, "step": 23173 }, { "epoch": 2.9479709960564815, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.78407287597656, "learning_rate": 1e-06, "loss": 0.6353, "mean_token_accuracy": 0.8617217540740967, "num_tokens": 884116539.0, "step": 23174 }, { "epoch": 2.948098206335072, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.3831901550293, "learning_rate": 1e-06, "loss": 0.6084, "mean_token_accuracy": 0.8694663047790527, "num_tokens": 884149756.0, "step": 23175 }, { "epoch": 2.9482254166136626, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.853607177734375, "learning_rate": 1e-06, "loss": 0.58, "mean_token_accuracy": 0.883324146270752, "num_tokens": 884181960.0, "step": 23176 }, { "epoch": 2.9483526268922526, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.26492691040039, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8572834134101868, "num_tokens": 884223562.0, "step": 23177 }, { "epoch": 2.9484798371708436, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.60557556152344, "learning_rate": 1e-06, "loss": 0.6092, "mean_token_accuracy": 0.8679265975952148, "num_tokens": 884264541.0, "step": 23178 }, { "epoch": 2.9486070474494337, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.93415069580078, "learning_rate": 1e-06, "loss": 0.6735, "mean_token_accuracy": 0.8489217758178711, "num_tokens": 884301171.0, "step": 23179 }, { "epoch": 2.9487342577280247, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.698631286621094, "learning_rate": 1e-06, "loss": 0.6247, "mean_token_accuracy": 0.8610621690750122, "num_tokens": 884332481.0, "step": 23180 }, { "epoch": 2.9488614680066147, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.35783386230469, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8843812942504883, "num_tokens": 884371616.0, "step": 23181 }, { "epoch": 2.9489886782852053, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.351287841796875, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8522985577583313, "num_tokens": 884411245.0, "step": 23182 }, { "epoch": 2.949115888563796, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.2803955078125, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8608695864677429, "num_tokens": 884445712.0, "step": 23183 }, { "epoch": 2.9492430988423863, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.23942184448242, "learning_rate": 1e-06, "loss": 0.6109, "mean_token_accuracy": 0.8682492971420288, "num_tokens": 884484935.0, "step": 23184 }, { "epoch": 2.949370309120977, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.8582878112793, "learning_rate": 1e-06, "loss": 0.6241, "mean_token_accuracy": 0.8638107776641846, "num_tokens": 884518714.0, "step": 23185 }, { "epoch": 2.9494975193995674, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.64395523071289, "learning_rate": 1e-06, "loss": 0.6614, "mean_token_accuracy": 0.8564901351928711, "num_tokens": 884560430.0, "step": 23186 }, { "epoch": 2.949624729678158, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.48617172241211, "learning_rate": 1e-06, "loss": 0.6501, "mean_token_accuracy": 0.8551802635192871, "num_tokens": 884595937.0, "step": 23187 }, { "epoch": 2.9497519399567484, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.80303955078125, "learning_rate": 1e-06, "loss": 0.651, "mean_token_accuracy": 0.85817551612854, "num_tokens": 884633201.0, "step": 23188 }, { "epoch": 2.949879150235339, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.851505279541016, "learning_rate": 1e-06, "loss": 0.6225, "mean_token_accuracy": 0.865857720375061, "num_tokens": 884672602.0, "step": 23189 }, { "epoch": 2.9500063605139295, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.97507858276367, "learning_rate": 1e-06, "loss": 0.6141, "mean_token_accuracy": 0.8646528124809265, "num_tokens": 884708378.0, "step": 23190 }, { "epoch": 2.95013357079252, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.003543853759766, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8601168394088745, "num_tokens": 884747527.0, "step": 23191 }, { "epoch": 2.9502607810711106, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.17981719970703, "learning_rate": 1e-06, "loss": 0.5657, "mean_token_accuracy": 0.8831640481948853, "num_tokens": 884781738.0, "step": 23192 }, { "epoch": 2.950387991349701, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.14158248901367, "learning_rate": 1e-06, "loss": 0.6669, "mean_token_accuracy": 0.8498209714889526, "num_tokens": 884818884.0, "step": 23193 }, { "epoch": 2.9505152016282916, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.23298263549805, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8731343150138855, "num_tokens": 884851751.0, "step": 23194 }, { "epoch": 2.950642411906882, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.75859451293945, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.865472137928009, "num_tokens": 884892935.0, "step": 23195 }, { "epoch": 2.9507696221854727, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.83262252807617, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8687487244606018, "num_tokens": 884935820.0, "step": 23196 }, { "epoch": 2.950896832464063, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.619873046875, "learning_rate": 1e-06, "loss": 0.6455, "mean_token_accuracy": 0.8613234758377075, "num_tokens": 884976042.0, "step": 23197 }, { "epoch": 2.9510240427426537, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.527645111083984, "learning_rate": 1e-06, "loss": 0.5633, "mean_token_accuracy": 0.8826299905776978, "num_tokens": 885013511.0, "step": 23198 }, { "epoch": 2.9511512530212443, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.01424026489258, "learning_rate": 1e-06, "loss": 0.6697, "mean_token_accuracy": 0.8505733013153076, "num_tokens": 885047291.0, "step": 23199 }, { "epoch": 2.951278463299835, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.97339630126953, "learning_rate": 1e-06, "loss": 0.6482, "mean_token_accuracy": 0.8585109710693359, "num_tokens": 885083116.0, "step": 23200 }, { "epoch": 2.9514056735784253, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.14890670776367, "learning_rate": 1e-06, "loss": 0.6419, "mean_token_accuracy": 0.8594019412994385, "num_tokens": 885123282.0, "step": 23201 }, { "epoch": 2.9515328838570154, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.21475601196289, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8676822185516357, "num_tokens": 885166277.0, "step": 23202 }, { "epoch": 2.9516600941356064, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.24711227416992, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.876101016998291, "num_tokens": 885206414.0, "step": 23203 }, { "epoch": 2.9517873044141965, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.97325134277344, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8702064156532288, "num_tokens": 885240977.0, "step": 23204 }, { "epoch": 2.9519145146927874, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.62575912475586, "learning_rate": 1e-06, "loss": 0.5804, "mean_token_accuracy": 0.8799693584442139, "num_tokens": 885283190.0, "step": 23205 }, { "epoch": 2.9520417249713775, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.09707260131836, "learning_rate": 1e-06, "loss": 0.5869, "mean_token_accuracy": 0.8771142959594727, "num_tokens": 885321340.0, "step": 23206 }, { "epoch": 2.952168935249968, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.31880569458008, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8727719783782959, "num_tokens": 885357524.0, "step": 23207 }, { "epoch": 2.9522961455285586, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.07197189331055, "learning_rate": 1e-06, "loss": 0.7054, "mean_token_accuracy": 0.8378361463546753, "num_tokens": 885389696.0, "step": 23208 }, { "epoch": 2.952423355807149, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.27021408081055, "learning_rate": 1e-06, "loss": 0.5807, "mean_token_accuracy": 0.8789313435554504, "num_tokens": 885429574.0, "step": 23209 }, { "epoch": 2.9525505660857396, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.36840057373047, "learning_rate": 1e-06, "loss": 0.6167, "mean_token_accuracy": 0.8658574819564819, "num_tokens": 885459961.0, "step": 23210 }, { "epoch": 2.95267777636433, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.75493240356445, "learning_rate": 1e-06, "loss": 0.6551, "mean_token_accuracy": 0.8569022417068481, "num_tokens": 885506725.0, "step": 23211 }, { "epoch": 2.9528049866429207, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.499061584472656, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.8679724931716919, "num_tokens": 885549409.0, "step": 23212 }, { "epoch": 2.952932196921511, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.9637451171875, "learning_rate": 1e-06, "loss": 0.6151, "mean_token_accuracy": 0.8710633516311646, "num_tokens": 885589272.0, "step": 23213 }, { "epoch": 2.9530594072001017, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.5250129699707, "learning_rate": 1e-06, "loss": 0.6808, "mean_token_accuracy": 0.8475014567375183, "num_tokens": 885625951.0, "step": 23214 }, { "epoch": 2.9531866174786923, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.910282135009766, "learning_rate": 1e-06, "loss": 0.6183, "mean_token_accuracy": 0.8673737049102783, "num_tokens": 885665844.0, "step": 23215 }, { "epoch": 2.953313827757283, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.765655517578125, "learning_rate": 1e-06, "loss": 0.6346, "mean_token_accuracy": 0.8625650405883789, "num_tokens": 885705616.0, "step": 23216 }, { "epoch": 2.9534410380358733, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.87483215332031, "learning_rate": 1e-06, "loss": 0.5998, "mean_token_accuracy": 0.8758074045181274, "num_tokens": 885752362.0, "step": 23217 }, { "epoch": 2.953568248314464, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.811275482177734, "learning_rate": 1e-06, "loss": 0.6538, "mean_token_accuracy": 0.8585861921310425, "num_tokens": 885792485.0, "step": 23218 }, { "epoch": 2.9536954585930544, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.84288024902344, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8652093410491943, "num_tokens": 885834809.0, "step": 23219 }, { "epoch": 2.953822668871645, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.75891876220703, "learning_rate": 1e-06, "loss": 0.6143, "mean_token_accuracy": 0.8690080642700195, "num_tokens": 885870555.0, "step": 23220 }, { "epoch": 2.9539498791502354, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.399532318115234, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8662936687469482, "num_tokens": 885911911.0, "step": 23221 }, { "epoch": 2.954077089428826, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.838314056396484, "learning_rate": 1e-06, "loss": 0.5902, "mean_token_accuracy": 0.8760773539543152, "num_tokens": 885941094.0, "step": 23222 }, { "epoch": 2.9542042997074165, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.1215705871582, "learning_rate": 1e-06, "loss": 0.6385, "mean_token_accuracy": 0.8630186319351196, "num_tokens": 885978094.0, "step": 23223 }, { "epoch": 2.954331509986007, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.19940948486328, "learning_rate": 1e-06, "loss": 0.5864, "mean_token_accuracy": 0.8814034461975098, "num_tokens": 886014859.0, "step": 23224 }, { "epoch": 2.954458720264597, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.82581329345703, "learning_rate": 1e-06, "loss": 0.6125, "mean_token_accuracy": 0.8711628913879395, "num_tokens": 886048317.0, "step": 23225 }, { "epoch": 2.954585930543188, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.4958610534668, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8668322563171387, "num_tokens": 886081797.0, "step": 23226 }, { "epoch": 2.954713140821778, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.03670883178711, "learning_rate": 1e-06, "loss": 0.6743, "mean_token_accuracy": 0.8508080840110779, "num_tokens": 886124111.0, "step": 23227 }, { "epoch": 2.954840351100369, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.7486572265625, "learning_rate": 1e-06, "loss": 0.6104, "mean_token_accuracy": 0.8753770589828491, "num_tokens": 886162823.0, "step": 23228 }, { "epoch": 2.954967561378959, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.277870178222656, "learning_rate": 1e-06, "loss": 0.6227, "mean_token_accuracy": 0.8656831979751587, "num_tokens": 886204706.0, "step": 23229 }, { "epoch": 2.95509477165755, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.777645111083984, "learning_rate": 1e-06, "loss": 0.6054, "mean_token_accuracy": 0.8717318177223206, "num_tokens": 886241461.0, "step": 23230 }, { "epoch": 2.9552219819361403, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.179569244384766, "learning_rate": 1e-06, "loss": 0.6442, "mean_token_accuracy": 0.8596121072769165, "num_tokens": 886279276.0, "step": 23231 }, { "epoch": 2.955349192214731, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.363372802734375, "learning_rate": 1e-06, "loss": 0.6244, "mean_token_accuracy": 0.8661518096923828, "num_tokens": 886317703.0, "step": 23232 }, { "epoch": 2.9554764024933213, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.291709899902344, "learning_rate": 1e-06, "loss": 0.6095, "mean_token_accuracy": 0.8707914352416992, "num_tokens": 886355235.0, "step": 23233 }, { "epoch": 2.955603612771912, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.493404388427734, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8899345397949219, "num_tokens": 886387354.0, "step": 23234 }, { "epoch": 2.9557308230505024, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.4052734375, "learning_rate": 1e-06, "loss": 0.6335, "mean_token_accuracy": 0.8613933324813843, "num_tokens": 886419742.0, "step": 23235 }, { "epoch": 2.955858033329093, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.46207046508789, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8655722141265869, "num_tokens": 886455218.0, "step": 23236 }, { "epoch": 2.9559852436076834, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.33718490600586, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8642472624778748, "num_tokens": 886494454.0, "step": 23237 }, { "epoch": 2.956112453886274, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.59973907470703, "learning_rate": 1e-06, "loss": 0.6812, "mean_token_accuracy": 0.8415544033050537, "num_tokens": 886532854.0, "step": 23238 }, { "epoch": 2.9562396641648645, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.9229736328125, "learning_rate": 1e-06, "loss": 0.6259, "mean_token_accuracy": 0.8602877855300903, "num_tokens": 886564288.0, "step": 23239 }, { "epoch": 2.956366874443455, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.059139251708984, "learning_rate": 1e-06, "loss": 0.6169, "mean_token_accuracy": 0.8659586906433105, "num_tokens": 886597155.0, "step": 23240 }, { "epoch": 2.9564940847220456, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.70636749267578, "learning_rate": 1e-06, "loss": 0.6067, "mean_token_accuracy": 0.867327094078064, "num_tokens": 886632015.0, "step": 23241 }, { "epoch": 2.956621295000636, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.93929672241211, "learning_rate": 1e-06, "loss": 0.5731, "mean_token_accuracy": 0.8807144165039062, "num_tokens": 886665736.0, "step": 23242 }, { "epoch": 2.9567485052792266, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.03511428833008, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8639912605285645, "num_tokens": 886702652.0, "step": 23243 }, { "epoch": 2.956875715557817, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.524696350097656, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.8618555665016174, "num_tokens": 886743573.0, "step": 23244 }, { "epoch": 2.9570029258364077, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.087440490722656, "learning_rate": 1e-06, "loss": 0.6098, "mean_token_accuracy": 0.8692610859870911, "num_tokens": 886777932.0, "step": 23245 }, { "epoch": 2.957130136114998, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.97592544555664, "learning_rate": 1e-06, "loss": 0.6728, "mean_token_accuracy": 0.8539016842842102, "num_tokens": 886813211.0, "step": 23246 }, { "epoch": 2.9572573463935887, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.23007583618164, "learning_rate": 1e-06, "loss": 0.607, "mean_token_accuracy": 0.8690693378448486, "num_tokens": 886849416.0, "step": 23247 }, { "epoch": 2.9573845566721793, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.36369323730469, "learning_rate": 1e-06, "loss": 0.6792, "mean_token_accuracy": 0.8488569855690002, "num_tokens": 886887983.0, "step": 23248 }, { "epoch": 2.95751176695077, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.08928680419922, "learning_rate": 1e-06, "loss": 0.677, "mean_token_accuracy": 0.8481827974319458, "num_tokens": 886930867.0, "step": 23249 }, { "epoch": 2.95763897722936, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.221595764160156, "learning_rate": 1e-06, "loss": 0.6275, "mean_token_accuracy": 0.8628016710281372, "num_tokens": 886971862.0, "step": 23250 }, { "epoch": 2.957766187507951, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.332489013671875, "learning_rate": 1e-06, "loss": 0.5984, "mean_token_accuracy": 0.8705211877822876, "num_tokens": 887005238.0, "step": 23251 }, { "epoch": 2.957893397786541, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.13939666748047, "learning_rate": 1e-06, "loss": 0.6408, "mean_token_accuracy": 0.8565400838851929, "num_tokens": 887035588.0, "step": 23252 }, { "epoch": 2.958020608065132, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.43875503540039, "learning_rate": 1e-06, "loss": 0.6246, "mean_token_accuracy": 0.8649747371673584, "num_tokens": 887079962.0, "step": 23253 }, { "epoch": 2.958147818343722, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.939449310302734, "learning_rate": 1e-06, "loss": 0.5612, "mean_token_accuracy": 0.8837932348251343, "num_tokens": 887114572.0, "step": 23254 }, { "epoch": 2.958275028622313, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.430030822753906, "learning_rate": 1e-06, "loss": 0.6155, "mean_token_accuracy": 0.8689686059951782, "num_tokens": 887149396.0, "step": 23255 }, { "epoch": 2.958402238900903, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.14442825317383, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8587350845336914, "num_tokens": 887191916.0, "step": 23256 }, { "epoch": 2.9585294491794936, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.229286193847656, "learning_rate": 1e-06, "loss": 0.609, "mean_token_accuracy": 0.8739299774169922, "num_tokens": 887228263.0, "step": 23257 }, { "epoch": 2.958656659458084, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.55167770385742, "learning_rate": 1e-06, "loss": 0.5846, "mean_token_accuracy": 0.8812412023544312, "num_tokens": 887258688.0, "step": 23258 }, { "epoch": 2.9587838697366746, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.064090728759766, "learning_rate": 1e-06, "loss": 0.6463, "mean_token_accuracy": 0.8601327538490295, "num_tokens": 887301565.0, "step": 23259 }, { "epoch": 2.958911080015265, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.350650787353516, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8618942499160767, "num_tokens": 887337511.0, "step": 23260 }, { "epoch": 2.9590382902938557, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.735313415527344, "learning_rate": 1e-06, "loss": 0.598, "mean_token_accuracy": 0.8672129511833191, "num_tokens": 887370694.0, "step": 23261 }, { "epoch": 2.959165500572446, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.644371032714844, "learning_rate": 1e-06, "loss": 0.6523, "mean_token_accuracy": 0.8614394664764404, "num_tokens": 887412131.0, "step": 23262 }, { "epoch": 2.9592927108510367, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.001556396484375, "learning_rate": 1e-06, "loss": 0.6435, "mean_token_accuracy": 0.8582748174667358, "num_tokens": 887447179.0, "step": 23263 }, { "epoch": 2.9594199211296273, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.26995849609375, "learning_rate": 1e-06, "loss": 0.6506, "mean_token_accuracy": 0.8588577508926392, "num_tokens": 887486079.0, "step": 23264 }, { "epoch": 2.959547131408218, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.319114685058594, "learning_rate": 1e-06, "loss": 0.6043, "mean_token_accuracy": 0.8728082180023193, "num_tokens": 887519092.0, "step": 23265 }, { "epoch": 2.9596743416868083, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.28502655029297, "learning_rate": 1e-06, "loss": 0.633, "mean_token_accuracy": 0.8660134077072144, "num_tokens": 887552008.0, "step": 23266 }, { "epoch": 2.959801551965399, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.258644104003906, "learning_rate": 1e-06, "loss": 0.6376, "mean_token_accuracy": 0.8600183725357056, "num_tokens": 887585827.0, "step": 23267 }, { "epoch": 2.9599287622439894, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.0422477722168, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8633240461349487, "num_tokens": 887628041.0, "step": 23268 }, { "epoch": 2.96005597252258, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.32575225830078, "learning_rate": 1e-06, "loss": 0.6102, "mean_token_accuracy": 0.868787407875061, "num_tokens": 887667210.0, "step": 23269 }, { "epoch": 2.9601831828011704, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.61452102661133, "learning_rate": 1e-06, "loss": 0.6046, "mean_token_accuracy": 0.8693152666091919, "num_tokens": 887705429.0, "step": 23270 }, { "epoch": 2.960310393079761, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.37184143066406, "learning_rate": 1e-06, "loss": 0.6926, "mean_token_accuracy": 0.8459854125976562, "num_tokens": 887753188.0, "step": 23271 }, { "epoch": 2.9604376033583515, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.75234603881836, "learning_rate": 1e-06, "loss": 0.6774, "mean_token_accuracy": 0.848944902420044, "num_tokens": 887795907.0, "step": 23272 }, { "epoch": 2.960564813636942, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.84964370727539, "learning_rate": 1e-06, "loss": 0.6334, "mean_token_accuracy": 0.8641737103462219, "num_tokens": 887834859.0, "step": 23273 }, { "epoch": 2.9606920239155325, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.800018310546875, "learning_rate": 1e-06, "loss": 0.6393, "mean_token_accuracy": 0.8600683212280273, "num_tokens": 887879041.0, "step": 23274 }, { "epoch": 2.9608192341941226, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.89339828491211, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8596855998039246, "num_tokens": 887913752.0, "step": 23275 }, { "epoch": 2.9609464444727136, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.537010192871094, "learning_rate": 1e-06, "loss": 0.6086, "mean_token_accuracy": 0.869895339012146, "num_tokens": 887950946.0, "step": 23276 }, { "epoch": 2.9610736547513037, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.23463821411133, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.8604501485824585, "num_tokens": 887993155.0, "step": 23277 }, { "epoch": 2.9612008650298947, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.47919845581055, "learning_rate": 1e-06, "loss": 0.645, "mean_token_accuracy": 0.8642759323120117, "num_tokens": 888027831.0, "step": 23278 }, { "epoch": 2.9613280753084847, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.391902923583984, "learning_rate": 1e-06, "loss": 0.5949, "mean_token_accuracy": 0.8737651109695435, "num_tokens": 888066941.0, "step": 23279 }, { "epoch": 2.9614552855870753, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.319488525390625, "learning_rate": 1e-06, "loss": 0.5781, "mean_token_accuracy": 0.8782443404197693, "num_tokens": 888099739.0, "step": 23280 }, { "epoch": 2.961582495865666, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.307220458984375, "learning_rate": 1e-06, "loss": 0.6206, "mean_token_accuracy": 0.8652859330177307, "num_tokens": 888139675.0, "step": 23281 }, { "epoch": 2.9617097061442563, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.34562683105469, "learning_rate": 1e-06, "loss": 0.653, "mean_token_accuracy": 0.8516745567321777, "num_tokens": 888179374.0, "step": 23282 }, { "epoch": 2.961836916422847, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.30550765991211, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8531621098518372, "num_tokens": 888214139.0, "step": 23283 }, { "epoch": 2.9619641267014374, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.149085998535156, "learning_rate": 1e-06, "loss": 0.6293, "mean_token_accuracy": 0.8627536296844482, "num_tokens": 888246365.0, "step": 23284 }, { "epoch": 2.962091336980028, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.39692306518555, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8581924438476562, "num_tokens": 888290903.0, "step": 23285 }, { "epoch": 2.9622185472586184, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.22896194458008, "learning_rate": 1e-06, "loss": 0.6403, "mean_token_accuracy": 0.8622424006462097, "num_tokens": 888331329.0, "step": 23286 }, { "epoch": 2.962345757537209, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.56476593017578, "learning_rate": 1e-06, "loss": 0.6465, "mean_token_accuracy": 0.8582878112792969, "num_tokens": 888372411.0, "step": 23287 }, { "epoch": 2.9624729678157995, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.70519256591797, "learning_rate": 1e-06, "loss": 0.6315, "mean_token_accuracy": 0.8625696897506714, "num_tokens": 888409778.0, "step": 23288 }, { "epoch": 2.96260017809439, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.15280532836914, "learning_rate": 1e-06, "loss": 0.611, "mean_token_accuracy": 0.8706390261650085, "num_tokens": 888453593.0, "step": 23289 }, { "epoch": 2.9627273883729806, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.58480453491211, "learning_rate": 1e-06, "loss": 0.5829, "mean_token_accuracy": 0.8759757280349731, "num_tokens": 888488245.0, "step": 23290 }, { "epoch": 2.962854598651571, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.828060150146484, "learning_rate": 1e-06, "loss": 0.6283, "mean_token_accuracy": 0.8646360635757446, "num_tokens": 888524306.0, "step": 23291 }, { "epoch": 2.9629818089301616, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.05381774902344, "learning_rate": 1e-06, "loss": 0.5877, "mean_token_accuracy": 0.875937283039093, "num_tokens": 888563791.0, "step": 23292 }, { "epoch": 2.963109019208752, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.674251556396484, "learning_rate": 1e-06, "loss": 0.6171, "mean_token_accuracy": 0.8685646057128906, "num_tokens": 888605021.0, "step": 23293 }, { "epoch": 2.9632362294873427, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.12679672241211, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8643968105316162, "num_tokens": 888647382.0, "step": 23294 }, { "epoch": 2.963363439765933, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.186553955078125, "learning_rate": 1e-06, "loss": 0.64, "mean_token_accuracy": 0.863472044467926, "num_tokens": 888682174.0, "step": 23295 }, { "epoch": 2.9634906500445237, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.76926040649414, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.865837574005127, "num_tokens": 888724401.0, "step": 23296 }, { "epoch": 2.9636178603231143, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.85062789916992, "learning_rate": 1e-06, "loss": 0.6044, "mean_token_accuracy": 0.871181845664978, "num_tokens": 888766230.0, "step": 23297 }, { "epoch": 2.963745070601705, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.38199996948242, "learning_rate": 1e-06, "loss": 0.7, "mean_token_accuracy": 0.8428906202316284, "num_tokens": 888811865.0, "step": 23298 }, { "epoch": 2.9638722808802953, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.87344741821289, "learning_rate": 1e-06, "loss": 0.6288, "mean_token_accuracy": 0.8650197982788086, "num_tokens": 888849549.0, "step": 23299 }, { "epoch": 2.9639994911588854, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.3116455078125, "learning_rate": 1e-06, "loss": 0.6211, "mean_token_accuracy": 0.8652847409248352, "num_tokens": 888881305.0, "step": 23300 }, { "epoch": 2.9641267014374764, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.552978515625, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.869151771068573, "num_tokens": 888917862.0, "step": 23301 }, { "epoch": 2.9642539117160664, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.925052642822266, "learning_rate": 1e-06, "loss": 0.6267, "mean_token_accuracy": 0.8651622533798218, "num_tokens": 888953883.0, "step": 23302 }, { "epoch": 2.9643811219946574, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.29867935180664, "learning_rate": 1e-06, "loss": 0.5801, "mean_token_accuracy": 0.8812093734741211, "num_tokens": 888991052.0, "step": 23303 }, { "epoch": 2.9645083322732475, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.408409118652344, "learning_rate": 1e-06, "loss": 0.5975, "mean_token_accuracy": 0.8753910064697266, "num_tokens": 889022523.0, "step": 23304 }, { "epoch": 2.964635542551838, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.188209533691406, "learning_rate": 1e-06, "loss": 0.6548, "mean_token_accuracy": 0.8551341891288757, "num_tokens": 889063470.0, "step": 23305 }, { "epoch": 2.9647627528304286, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.34700012207031, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8722639083862305, "num_tokens": 889101160.0, "step": 23306 }, { "epoch": 2.964889963109019, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.250064849853516, "learning_rate": 1e-06, "loss": 0.6379, "mean_token_accuracy": 0.8614097833633423, "num_tokens": 889139128.0, "step": 23307 }, { "epoch": 2.9650171733876096, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.18287658691406, "learning_rate": 1e-06, "loss": 0.6966, "mean_token_accuracy": 0.8402500748634338, "num_tokens": 889175574.0, "step": 23308 }, { "epoch": 2.9651443836662, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.345829010009766, "learning_rate": 1e-06, "loss": 0.647, "mean_token_accuracy": 0.8603660464286804, "num_tokens": 889216520.0, "step": 23309 }, { "epoch": 2.9652715939447907, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.60459518432617, "learning_rate": 1e-06, "loss": 0.5933, "mean_token_accuracy": 0.8761672973632812, "num_tokens": 889256746.0, "step": 23310 }, { "epoch": 2.965398804223381, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.43858337402344, "learning_rate": 1e-06, "loss": 0.6515, "mean_token_accuracy": 0.859796941280365, "num_tokens": 889298868.0, "step": 23311 }, { "epoch": 2.9655260145019717, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.238792419433594, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.8591794371604919, "num_tokens": 889337758.0, "step": 23312 }, { "epoch": 2.9656532247805623, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.661128997802734, "learning_rate": 1e-06, "loss": 0.6212, "mean_token_accuracy": 0.8696885704994202, "num_tokens": 889375308.0, "step": 23313 }, { "epoch": 2.965780435059153, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.95488357543945, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8717892169952393, "num_tokens": 889413735.0, "step": 23314 }, { "epoch": 2.9659076453377433, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.85172653198242, "learning_rate": 1e-06, "loss": 0.6278, "mean_token_accuracy": 0.8659993410110474, "num_tokens": 889451585.0, "step": 23315 }, { "epoch": 2.966034855616334, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.64154052734375, "learning_rate": 1e-06, "loss": 0.6886, "mean_token_accuracy": 0.8470392823219299, "num_tokens": 889487405.0, "step": 23316 }, { "epoch": 2.9661620658949244, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.96820068359375, "learning_rate": 1e-06, "loss": 0.6656, "mean_token_accuracy": 0.8510288000106812, "num_tokens": 889523051.0, "step": 23317 }, { "epoch": 2.966289276173515, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.01243209838867, "learning_rate": 1e-06, "loss": 0.6726, "mean_token_accuracy": 0.8454480171203613, "num_tokens": 889559822.0, "step": 23318 }, { "epoch": 2.9664164864521054, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.34830856323242, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.8627564907073975, "num_tokens": 889591598.0, "step": 23319 }, { "epoch": 2.966543696730696, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.061187744140625, "learning_rate": 1e-06, "loss": 0.5938, "mean_token_accuracy": 0.8746511936187744, "num_tokens": 889629787.0, "step": 23320 }, { "epoch": 2.9666709070092865, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.300655364990234, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8711763620376587, "num_tokens": 889668730.0, "step": 23321 }, { "epoch": 2.966798117287877, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.416168212890625, "learning_rate": 1e-06, "loss": 0.6392, "mean_token_accuracy": 0.8622013330459595, "num_tokens": 889705215.0, "step": 23322 }, { "epoch": 2.966925327566467, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.19850158691406, "learning_rate": 1e-06, "loss": 0.6328, "mean_token_accuracy": 0.8652975559234619, "num_tokens": 889738528.0, "step": 23323 }, { "epoch": 2.967052537845058, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.226097106933594, "learning_rate": 1e-06, "loss": 0.6264, "mean_token_accuracy": 0.8676347732543945, "num_tokens": 889771694.0, "step": 23324 }, { "epoch": 2.967179748123648, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.20416259765625, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8779836893081665, "num_tokens": 889806096.0, "step": 23325 }, { "epoch": 2.967306958402239, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.57126998901367, "learning_rate": 1e-06, "loss": 0.6418, "mean_token_accuracy": 0.8638464212417603, "num_tokens": 889848514.0, "step": 23326 }, { "epoch": 2.967434168680829, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.008087158203125, "learning_rate": 1e-06, "loss": 0.5695, "mean_token_accuracy": 0.884711503982544, "num_tokens": 889889689.0, "step": 23327 }, { "epoch": 2.96756137895942, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.72031021118164, "learning_rate": 1e-06, "loss": 0.6215, "mean_token_accuracy": 0.8669207692146301, "num_tokens": 889925550.0, "step": 23328 }, { "epoch": 2.9676885892380103, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.48292922973633, "learning_rate": 1e-06, "loss": 0.6665, "mean_token_accuracy": 0.8503044247627258, "num_tokens": 889961797.0, "step": 23329 }, { "epoch": 2.967815799516601, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.557579040527344, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8703272938728333, "num_tokens": 889997935.0, "step": 23330 }, { "epoch": 2.9679430097951913, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.526161193847656, "learning_rate": 1e-06, "loss": 0.5885, "mean_token_accuracy": 0.8768779039382935, "num_tokens": 890037900.0, "step": 23331 }, { "epoch": 2.968070220073782, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.69019317626953, "learning_rate": 1e-06, "loss": 0.612, "mean_token_accuracy": 0.8710200786590576, "num_tokens": 890074814.0, "step": 23332 }, { "epoch": 2.9681974303523724, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.344547271728516, "learning_rate": 1e-06, "loss": 0.5955, "mean_token_accuracy": 0.8775365352630615, "num_tokens": 890107091.0, "step": 23333 }, { "epoch": 2.968324640630963, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.48666000366211, "learning_rate": 1e-06, "loss": 0.6083, "mean_token_accuracy": 0.8723065257072449, "num_tokens": 890140558.0, "step": 23334 }, { "epoch": 2.9684518509095534, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.255218505859375, "learning_rate": 1e-06, "loss": 0.6253, "mean_token_accuracy": 0.8642136454582214, "num_tokens": 890174563.0, "step": 23335 }, { "epoch": 2.968579061188144, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.59634780883789, "learning_rate": 1e-06, "loss": 0.6316, "mean_token_accuracy": 0.862836480140686, "num_tokens": 890210584.0, "step": 23336 }, { "epoch": 2.9687062714667345, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.09835433959961, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8790550827980042, "num_tokens": 890251955.0, "step": 23337 }, { "epoch": 2.968833481745325, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.00605010986328, "learning_rate": 1e-06, "loss": 0.6309, "mean_token_accuracy": 0.8599883317947388, "num_tokens": 890289095.0, "step": 23338 }, { "epoch": 2.9689606920239155, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.641902923583984, "learning_rate": 1e-06, "loss": 0.6271, "mean_token_accuracy": 0.8660240173339844, "num_tokens": 890327864.0, "step": 23339 }, { "epoch": 2.969087902302506, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.57239532470703, "learning_rate": 1e-06, "loss": 0.63, "mean_token_accuracy": 0.8641998171806335, "num_tokens": 890364244.0, "step": 23340 }, { "epoch": 2.9692151125810966, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.91679763793945, "learning_rate": 1e-06, "loss": 0.6815, "mean_token_accuracy": 0.8503289222717285, "num_tokens": 890397652.0, "step": 23341 }, { "epoch": 2.969342322859687, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.59865188598633, "learning_rate": 1e-06, "loss": 0.613, "mean_token_accuracy": 0.8710096478462219, "num_tokens": 890435598.0, "step": 23342 }, { "epoch": 2.9694695331382777, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.19990539550781, "learning_rate": 1e-06, "loss": 0.6265, "mean_token_accuracy": 0.8682428598403931, "num_tokens": 890473445.0, "step": 23343 }, { "epoch": 2.969596743416868, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.265541076660156, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8684939742088318, "num_tokens": 890514263.0, "step": 23344 }, { "epoch": 2.9697239536954587, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.937435150146484, "learning_rate": 1e-06, "loss": 0.6, "mean_token_accuracy": 0.8688066005706787, "num_tokens": 890554123.0, "step": 23345 }, { "epoch": 2.9698511639740492, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.53264617919922, "learning_rate": 1e-06, "loss": 0.6416, "mean_token_accuracy": 0.862991452217102, "num_tokens": 890593938.0, "step": 23346 }, { "epoch": 2.9699783742526398, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.77736282348633, "learning_rate": 1e-06, "loss": 0.659, "mean_token_accuracy": 0.855570375919342, "num_tokens": 890632260.0, "step": 23347 }, { "epoch": 2.97010558453123, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.7841682434082, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.8625184297561646, "num_tokens": 890666852.0, "step": 23348 }, { "epoch": 2.970232794809821, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.9207649230957, "learning_rate": 1e-06, "loss": 0.6356, "mean_token_accuracy": 0.8597677946090698, "num_tokens": 890707959.0, "step": 23349 }, { "epoch": 2.970360005088411, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.49245071411133, "learning_rate": 1e-06, "loss": 0.713, "mean_token_accuracy": 0.8436795473098755, "num_tokens": 890748729.0, "step": 23350 }, { "epoch": 2.970487215367002, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.818519592285156, "learning_rate": 1e-06, "loss": 0.5865, "mean_token_accuracy": 0.8777420520782471, "num_tokens": 890782116.0, "step": 23351 }, { "epoch": 2.970614425645592, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.8399543762207, "learning_rate": 1e-06, "loss": 0.671, "mean_token_accuracy": 0.8521242141723633, "num_tokens": 890827309.0, "step": 23352 }, { "epoch": 2.970741635924183, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.03089904785156, "learning_rate": 1e-06, "loss": 0.6705, "mean_token_accuracy": 0.8552205562591553, "num_tokens": 890863879.0, "step": 23353 }, { "epoch": 2.970868846202773, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.92661666870117, "learning_rate": 1e-06, "loss": 0.6205, "mean_token_accuracy": 0.8661478757858276, "num_tokens": 890901184.0, "step": 23354 }, { "epoch": 2.9709960564813636, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.00186538696289, "learning_rate": 1e-06, "loss": 0.5927, "mean_token_accuracy": 0.8743199110031128, "num_tokens": 890937779.0, "step": 23355 }, { "epoch": 2.971123266759954, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.51813507080078, "learning_rate": 1e-06, "loss": 0.5823, "mean_token_accuracy": 0.8794341683387756, "num_tokens": 890978055.0, "step": 23356 }, { "epoch": 2.9712504770385446, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.96488952636719, "learning_rate": 1e-06, "loss": 0.7619, "mean_token_accuracy": 0.8336542844772339, "num_tokens": 891010985.0, "step": 23357 }, { "epoch": 2.971377687317135, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.706485748291016, "learning_rate": 1e-06, "loss": 0.6157, "mean_token_accuracy": 0.8697994351387024, "num_tokens": 891045904.0, "step": 23358 }, { "epoch": 2.9715048975957257, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.797916412353516, "learning_rate": 1e-06, "loss": 0.5997, "mean_token_accuracy": 0.8696008920669556, "num_tokens": 891086240.0, "step": 23359 }, { "epoch": 2.971632107874316, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.14344024658203, "learning_rate": 1e-06, "loss": 0.6371, "mean_token_accuracy": 0.8615678548812866, "num_tokens": 891126482.0, "step": 23360 }, { "epoch": 2.9717593181529067, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.25783157348633, "learning_rate": 1e-06, "loss": 0.6567, "mean_token_accuracy": 0.8530360460281372, "num_tokens": 891165982.0, "step": 23361 }, { "epoch": 2.9718865284314973, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.21001052856445, "learning_rate": 1e-06, "loss": 0.6741, "mean_token_accuracy": 0.8497633934020996, "num_tokens": 891199101.0, "step": 23362 }, { "epoch": 2.972013738710088, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.4668083190918, "learning_rate": 1e-06, "loss": 0.6384, "mean_token_accuracy": 0.8633441925048828, "num_tokens": 891231511.0, "step": 23363 }, { "epoch": 2.9721409489886783, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.15003204345703, "learning_rate": 1e-06, "loss": 0.6717, "mean_token_accuracy": 0.8530531525611877, "num_tokens": 891262369.0, "step": 23364 }, { "epoch": 2.972268159267269, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.822723388671875, "learning_rate": 1e-06, "loss": 0.6682, "mean_token_accuracy": 0.8503252267837524, "num_tokens": 891304206.0, "step": 23365 }, { "epoch": 2.9723953695458594, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 52.823402404785156, "learning_rate": 1e-06, "loss": 0.6678, "mean_token_accuracy": 0.8521855473518372, "num_tokens": 891348006.0, "step": 23366 }, { "epoch": 2.97252257982445, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.89360809326172, "learning_rate": 1e-06, "loss": 0.6038, "mean_token_accuracy": 0.8719505071640015, "num_tokens": 891386851.0, "step": 23367 }, { "epoch": 2.9726497901030404, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.725921630859375, "learning_rate": 1e-06, "loss": 0.5966, "mean_token_accuracy": 0.8743069767951965, "num_tokens": 891423582.0, "step": 23368 }, { "epoch": 2.972777000381631, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.903194427490234, "learning_rate": 1e-06, "loss": 0.5686, "mean_token_accuracy": 0.8810588121414185, "num_tokens": 891463773.0, "step": 23369 }, { "epoch": 2.9729042106602215, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.81997299194336, "learning_rate": 1e-06, "loss": 0.5855, "mean_token_accuracy": 0.8755671381950378, "num_tokens": 891498117.0, "step": 23370 }, { "epoch": 2.973031420938812, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.91604995727539, "learning_rate": 1e-06, "loss": 0.6032, "mean_token_accuracy": 0.8731088638305664, "num_tokens": 891529055.0, "step": 23371 }, { "epoch": 2.9731586312174025, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 52.739410400390625, "learning_rate": 1e-06, "loss": 0.5921, "mean_token_accuracy": 0.8736928105354309, "num_tokens": 891567370.0, "step": 23372 }, { "epoch": 2.9732858414959926, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.092708587646484, "learning_rate": 1e-06, "loss": 0.6189, "mean_token_accuracy": 0.8669277429580688, "num_tokens": 891602737.0, "step": 23373 }, { "epoch": 2.9734130517745836, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.51028060913086, "learning_rate": 1e-06, "loss": 0.661, "mean_token_accuracy": 0.8584312796592712, "num_tokens": 891639713.0, "step": 23374 }, { "epoch": 2.9735402620531737, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.734519958496094, "learning_rate": 1e-06, "loss": 0.6012, "mean_token_accuracy": 0.8764055967330933, "num_tokens": 891673931.0, "step": 23375 }, { "epoch": 2.9736674723317646, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.911563873291016, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8623143434524536, "num_tokens": 891708015.0, "step": 23376 }, { "epoch": 2.9737946826103547, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.093074798583984, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8599593639373779, "num_tokens": 891746023.0, "step": 23377 }, { "epoch": 2.9739218928889453, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 52.822566986083984, "learning_rate": 1e-06, "loss": 0.6445, "mean_token_accuracy": 0.8630407452583313, "num_tokens": 891777767.0, "step": 23378 }, { "epoch": 2.974049103167536, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.8558464050293, "learning_rate": 1e-06, "loss": 0.6165, "mean_token_accuracy": 0.867283046245575, "num_tokens": 891810816.0, "step": 23379 }, { "epoch": 2.9741763134461263, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.301109313964844, "learning_rate": 1e-06, "loss": 0.643, "mean_token_accuracy": 0.8571130037307739, "num_tokens": 891848810.0, "step": 23380 }, { "epoch": 2.974303523724717, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.51340866088867, "learning_rate": 1e-06, "loss": 0.6269, "mean_token_accuracy": 0.8672966361045837, "num_tokens": 891892553.0, "step": 23381 }, { "epoch": 2.9744307340033074, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.64264678955078, "learning_rate": 1e-06, "loss": 0.589, "mean_token_accuracy": 0.87508225440979, "num_tokens": 891927539.0, "step": 23382 }, { "epoch": 2.974557944281898, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.00203323364258, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8683059215545654, "num_tokens": 891959577.0, "step": 23383 }, { "epoch": 2.9746851545604884, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.7171745300293, "learning_rate": 1e-06, "loss": 0.7179, "mean_token_accuracy": 0.8380460739135742, "num_tokens": 892000799.0, "step": 23384 }, { "epoch": 2.974812364839079, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.09514617919922, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.8576163053512573, "num_tokens": 892042200.0, "step": 23385 }, { "epoch": 2.9749395751176695, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.5322380065918, "learning_rate": 1e-06, "loss": 0.6647, "mean_token_accuracy": 0.8535671830177307, "num_tokens": 892084838.0, "step": 23386 }, { "epoch": 2.97506678539626, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.074432373046875, "learning_rate": 1e-06, "loss": 0.6739, "mean_token_accuracy": 0.8484019041061401, "num_tokens": 892126442.0, "step": 23387 }, { "epoch": 2.9751939956748505, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.57574462890625, "learning_rate": 1e-06, "loss": 0.6539, "mean_token_accuracy": 0.8517762422561646, "num_tokens": 892165019.0, "step": 23388 }, { "epoch": 2.975321205953441, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.96337127685547, "learning_rate": 1e-06, "loss": 0.6238, "mean_token_accuracy": 0.8672063946723938, "num_tokens": 892202291.0, "step": 23389 }, { "epoch": 2.9754484162320316, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.52263641357422, "learning_rate": 1e-06, "loss": 0.6666, "mean_token_accuracy": 0.8546684980392456, "num_tokens": 892235159.0, "step": 23390 }, { "epoch": 2.975575626510622, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.9660530090332, "learning_rate": 1e-06, "loss": 0.671, "mean_token_accuracy": 0.8513340950012207, "num_tokens": 892271967.0, "step": 23391 }, { "epoch": 2.9757028367892127, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.49757766723633, "learning_rate": 1e-06, "loss": 0.6838, "mean_token_accuracy": 0.8497503995895386, "num_tokens": 892306230.0, "step": 23392 }, { "epoch": 2.975830047067803, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.4898796081543, "learning_rate": 1e-06, "loss": 0.6177, "mean_token_accuracy": 0.8646718859672546, "num_tokens": 892342118.0, "step": 23393 }, { "epoch": 2.9759572573463937, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.78254699707031, "learning_rate": 1e-06, "loss": 0.6161, "mean_token_accuracy": 0.8700485825538635, "num_tokens": 892380764.0, "step": 23394 }, { "epoch": 2.9760844676249842, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.577884674072266, "learning_rate": 1e-06, "loss": 0.6466, "mean_token_accuracy": 0.8611082434654236, "num_tokens": 892421113.0, "step": 23395 }, { "epoch": 2.9762116779035748, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.841957092285156, "learning_rate": 1e-06, "loss": 0.6071, "mean_token_accuracy": 0.8715096712112427, "num_tokens": 892457512.0, "step": 23396 }, { "epoch": 2.9763388881821653, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.455291748046875, "learning_rate": 1e-06, "loss": 0.6103, "mean_token_accuracy": 0.8695869445800781, "num_tokens": 892493599.0, "step": 23397 }, { "epoch": 2.9764660984607554, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.078155517578125, "learning_rate": 1e-06, "loss": 0.6514, "mean_token_accuracy": 0.853182852268219, "num_tokens": 892528474.0, "step": 23398 }, { "epoch": 2.9765933087393464, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.23395919799805, "learning_rate": 1e-06, "loss": 0.6039, "mean_token_accuracy": 0.8715630769729614, "num_tokens": 892569591.0, "step": 23399 }, { "epoch": 2.9767205190179364, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.48883056640625, "learning_rate": 1e-06, "loss": 0.6089, "mean_token_accuracy": 0.8681814670562744, "num_tokens": 892607185.0, "step": 23400 }, { "epoch": 2.9768477292965274, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.83171844482422, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.8670000433921814, "num_tokens": 892641453.0, "step": 23401 }, { "epoch": 2.9769749395751175, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.33620071411133, "learning_rate": 1e-06, "loss": 0.6523, "mean_token_accuracy": 0.8592013120651245, "num_tokens": 892676802.0, "step": 23402 }, { "epoch": 2.977102149853708, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.339412689208984, "learning_rate": 1e-06, "loss": 0.6024, "mean_token_accuracy": 0.8754373788833618, "num_tokens": 892711694.0, "step": 23403 }, { "epoch": 2.9772293601322986, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.375335693359375, "learning_rate": 1e-06, "loss": 0.6525, "mean_token_accuracy": 0.8548653721809387, "num_tokens": 892752662.0, "step": 23404 }, { "epoch": 2.977356570410889, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.12696075439453, "learning_rate": 1e-06, "loss": 0.6306, "mean_token_accuracy": 0.8638159036636353, "num_tokens": 892790170.0, "step": 23405 }, { "epoch": 2.9774837806894796, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.22902297973633, "learning_rate": 1e-06, "loss": 0.6969, "mean_token_accuracy": 0.8461543321609497, "num_tokens": 892831437.0, "step": 23406 }, { "epoch": 2.97761099096807, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.7725830078125, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.8856321573257446, "num_tokens": 892869022.0, "step": 23407 }, { "epoch": 2.9777382012466607, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.07093811035156, "learning_rate": 1e-06, "loss": 0.6298, "mean_token_accuracy": 0.8654628992080688, "num_tokens": 892910738.0, "step": 23408 }, { "epoch": 2.977865411525251, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.74260330200195, "learning_rate": 1e-06, "loss": 0.6123, "mean_token_accuracy": 0.8731314539909363, "num_tokens": 892952055.0, "step": 23409 }, { "epoch": 2.9779926218038417, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.202308654785156, "learning_rate": 1e-06, "loss": 0.6574, "mean_token_accuracy": 0.8554266691207886, "num_tokens": 892985843.0, "step": 23410 }, { "epoch": 2.9781198320824323, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.44626235961914, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8545007705688477, "num_tokens": 893031197.0, "step": 23411 }, { "epoch": 2.978247042361023, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.67972946166992, "learning_rate": 1e-06, "loss": 0.5743, "mean_token_accuracy": 0.8780465126037598, "num_tokens": 893068113.0, "step": 23412 }, { "epoch": 2.9783742526396133, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.021888732910156, "learning_rate": 1e-06, "loss": 0.5912, "mean_token_accuracy": 0.8750457763671875, "num_tokens": 893109148.0, "step": 23413 }, { "epoch": 2.978501462918204, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.791080474853516, "learning_rate": 1e-06, "loss": 0.6124, "mean_token_accuracy": 0.8692450523376465, "num_tokens": 893151406.0, "step": 23414 }, { "epoch": 2.9786286731967944, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.44577407836914, "learning_rate": 1e-06, "loss": 0.6087, "mean_token_accuracy": 0.8682118058204651, "num_tokens": 893190209.0, "step": 23415 }, { "epoch": 2.978755883475385, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.018333435058594, "learning_rate": 1e-06, "loss": 0.6413, "mean_token_accuracy": 0.8647579550743103, "num_tokens": 893232949.0, "step": 23416 }, { "epoch": 2.9788830937539754, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.87248992919922, "learning_rate": 1e-06, "loss": 0.6282, "mean_token_accuracy": 0.8644086718559265, "num_tokens": 893273931.0, "step": 23417 }, { "epoch": 2.979010304032566, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.01835632324219, "learning_rate": 1e-06, "loss": 0.6901, "mean_token_accuracy": 0.8423316478729248, "num_tokens": 893317465.0, "step": 23418 }, { "epoch": 2.9791375143111565, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.76594161987305, "learning_rate": 1e-06, "loss": 0.6865, "mean_token_accuracy": 0.8480223417282104, "num_tokens": 893354089.0, "step": 23419 }, { "epoch": 2.979264724589747, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.468265533447266, "learning_rate": 1e-06, "loss": 0.6333, "mean_token_accuracy": 0.8599876761436462, "num_tokens": 893388051.0, "step": 23420 }, { "epoch": 2.979391934868337, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.103599548339844, "learning_rate": 1e-06, "loss": 0.6524, "mean_token_accuracy": 0.8585859537124634, "num_tokens": 893426191.0, "step": 23421 }, { "epoch": 2.979519145146928, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.50912094116211, "learning_rate": 1e-06, "loss": 0.6533, "mean_token_accuracy": 0.8560913801193237, "num_tokens": 893462134.0, "step": 23422 }, { "epoch": 2.979646355425518, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.168704986572266, "learning_rate": 1e-06, "loss": 0.6645, "mean_token_accuracy": 0.8582165241241455, "num_tokens": 893503159.0, "step": 23423 }, { "epoch": 2.979773565704109, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.021785736083984, "learning_rate": 1e-06, "loss": 0.7034, "mean_token_accuracy": 0.8462275266647339, "num_tokens": 893538379.0, "step": 23424 }, { "epoch": 2.979900775982699, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.4852180480957, "learning_rate": 1e-06, "loss": 0.6608, "mean_token_accuracy": 0.8516799807548523, "num_tokens": 893580431.0, "step": 23425 }, { "epoch": 2.98002798626129, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.54088592529297, "learning_rate": 1e-06, "loss": 0.6662, "mean_token_accuracy": 0.8571668863296509, "num_tokens": 893622412.0, "step": 23426 }, { "epoch": 2.9801551965398803, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.180076599121094, "learning_rate": 1e-06, "loss": 0.6332, "mean_token_accuracy": 0.8664542436599731, "num_tokens": 893658575.0, "step": 23427 }, { "epoch": 2.980282406818471, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.398040771484375, "learning_rate": 1e-06, "loss": 0.606, "mean_token_accuracy": 0.871660053730011, "num_tokens": 893702812.0, "step": 23428 }, { "epoch": 2.9804096170970613, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.297725677490234, "learning_rate": 1e-06, "loss": 0.6578, "mean_token_accuracy": 0.8551533222198486, "num_tokens": 893742785.0, "step": 23429 }, { "epoch": 2.980536827375652, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.45146942138672, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8718242645263672, "num_tokens": 893782310.0, "step": 23430 }, { "epoch": 2.9806640376542424, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.30528259277344, "learning_rate": 1e-06, "loss": 0.5696, "mean_token_accuracy": 0.8802781105041504, "num_tokens": 893822794.0, "step": 23431 }, { "epoch": 2.980791247932833, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.873138427734375, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8707752823829651, "num_tokens": 893870462.0, "step": 23432 }, { "epoch": 2.9809184582114234, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.205875396728516, "learning_rate": 1e-06, "loss": 0.6402, "mean_token_accuracy": 0.8631983995437622, "num_tokens": 893910654.0, "step": 23433 }, { "epoch": 2.981045668490014, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.50965881347656, "learning_rate": 1e-06, "loss": 0.6311, "mean_token_accuracy": 0.8608596324920654, "num_tokens": 893948819.0, "step": 23434 }, { "epoch": 2.9811728787686045, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.29603958129883, "learning_rate": 1e-06, "loss": 0.6068, "mean_token_accuracy": 0.8701925277709961, "num_tokens": 893983325.0, "step": 23435 }, { "epoch": 2.981300089047195, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.54783248901367, "learning_rate": 1e-06, "loss": 0.6312, "mean_token_accuracy": 0.8622468709945679, "num_tokens": 894023712.0, "step": 23436 }, { "epoch": 2.9814272993257855, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.03561019897461, "learning_rate": 1e-06, "loss": 0.595, "mean_token_accuracy": 0.8733536601066589, "num_tokens": 894059719.0, "step": 23437 }, { "epoch": 2.981554509604376, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.91233825683594, "learning_rate": 1e-06, "loss": 0.6304, "mean_token_accuracy": 0.8649891018867493, "num_tokens": 894100150.0, "step": 23438 }, { "epoch": 2.9816817198829666, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.49699020385742, "learning_rate": 1e-06, "loss": 0.7173, "mean_token_accuracy": 0.8429675698280334, "num_tokens": 894134336.0, "step": 23439 }, { "epoch": 2.981808930161557, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.44760513305664, "learning_rate": 1e-06, "loss": 0.6802, "mean_token_accuracy": 0.8493607640266418, "num_tokens": 894168972.0, "step": 23440 }, { "epoch": 2.9819361404401477, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.53135299682617, "learning_rate": 1e-06, "loss": 0.6372, "mean_token_accuracy": 0.8614521026611328, "num_tokens": 894204641.0, "step": 23441 }, { "epoch": 2.982063350718738, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.54175567626953, "learning_rate": 1e-06, "loss": 0.6351, "mean_token_accuracy": 0.8654516339302063, "num_tokens": 894248403.0, "step": 23442 }, { "epoch": 2.9821905609973287, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.25654220581055, "learning_rate": 1e-06, "loss": 0.5688, "mean_token_accuracy": 0.8813414573669434, "num_tokens": 894284035.0, "step": 23443 }, { "epoch": 2.9823177712759192, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.42901611328125, "learning_rate": 1e-06, "loss": 0.6117, "mean_token_accuracy": 0.8688153028488159, "num_tokens": 894321804.0, "step": 23444 }, { "epoch": 2.9824449815545098, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.58887481689453, "learning_rate": 1e-06, "loss": 0.6045, "mean_token_accuracy": 0.871530294418335, "num_tokens": 894355648.0, "step": 23445 }, { "epoch": 2.9825721918331, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.0437126159668, "learning_rate": 1e-06, "loss": 0.6243, "mean_token_accuracy": 0.8633403182029724, "num_tokens": 894392947.0, "step": 23446 }, { "epoch": 2.982699402111691, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.870723724365234, "learning_rate": 1e-06, "loss": 0.6655, "mean_token_accuracy": 0.8534379005432129, "num_tokens": 894433437.0, "step": 23447 }, { "epoch": 2.982826612390281, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.105308532714844, "learning_rate": 1e-06, "loss": 0.6107, "mean_token_accuracy": 0.8692582845687866, "num_tokens": 894481428.0, "step": 23448 }, { "epoch": 2.982953822668872, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.67943572998047, "learning_rate": 1e-06, "loss": 0.634, "mean_token_accuracy": 0.8643244504928589, "num_tokens": 894518055.0, "step": 23449 }, { "epoch": 2.983081032947462, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.102840423583984, "learning_rate": 1e-06, "loss": 0.658, "mean_token_accuracy": 0.8574494123458862, "num_tokens": 894562693.0, "step": 23450 }, { "epoch": 2.983208243226053, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.507198333740234, "learning_rate": 1e-06, "loss": 0.6396, "mean_token_accuracy": 0.8618679046630859, "num_tokens": 894604973.0, "step": 23451 }, { "epoch": 2.983335453504643, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.17005920410156, "learning_rate": 1e-06, "loss": 0.6093, "mean_token_accuracy": 0.8694096207618713, "num_tokens": 894645672.0, "step": 23452 }, { "epoch": 2.9834626637832335, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.570377349853516, "learning_rate": 1e-06, "loss": 0.6409, "mean_token_accuracy": 0.8619663715362549, "num_tokens": 894686702.0, "step": 23453 }, { "epoch": 2.983589874061824, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.817466735839844, "learning_rate": 1e-06, "loss": 0.6064, "mean_token_accuracy": 0.8716485500335693, "num_tokens": 894722768.0, "step": 23454 }, { "epoch": 2.9837170843404146, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.99504852294922, "learning_rate": 1e-06, "loss": 0.6001, "mean_token_accuracy": 0.8765774965286255, "num_tokens": 894759216.0, "step": 23455 }, { "epoch": 2.983844294619005, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 52.390933990478516, "learning_rate": 1e-06, "loss": 0.61, "mean_token_accuracy": 0.8675283193588257, "num_tokens": 894799691.0, "step": 23456 }, { "epoch": 2.9839715048975957, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 54.237308502197266, "learning_rate": 1e-06, "loss": 0.6571, "mean_token_accuracy": 0.8599445819854736, "num_tokens": 894835869.0, "step": 23457 }, { "epoch": 2.984098715176186, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 52.653324127197266, "learning_rate": 1e-06, "loss": 0.6928, "mean_token_accuracy": 0.841968834400177, "num_tokens": 894875706.0, "step": 23458 }, { "epoch": 2.9842259254547767, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 53.70732498168945, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8698118925094604, "num_tokens": 894920135.0, "step": 23459 }, { "epoch": 2.9843531357333672, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.1051139831543, "learning_rate": 1e-06, "loss": 0.6217, "mean_token_accuracy": 0.8661085367202759, "num_tokens": 894960825.0, "step": 23460 }, { "epoch": 2.9844803460119578, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.54512023925781, "learning_rate": 1e-06, "loss": 0.6362, "mean_token_accuracy": 0.8654284477233887, "num_tokens": 895001935.0, "step": 23461 }, { "epoch": 2.9846075562905483, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.99083709716797, "learning_rate": 1e-06, "loss": 0.5917, "mean_token_accuracy": 0.8732547163963318, "num_tokens": 895033559.0, "step": 23462 }, { "epoch": 2.984734766569139, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.416648864746094, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.8640978336334229, "num_tokens": 895070958.0, "step": 23463 }, { "epoch": 2.9848619768477294, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.06570053100586, "learning_rate": 1e-06, "loss": 0.6359, "mean_token_accuracy": 0.8630561828613281, "num_tokens": 895111533.0, "step": 23464 }, { "epoch": 2.98498918712632, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.54729461669922, "learning_rate": 1e-06, "loss": 0.6404, "mean_token_accuracy": 0.8600070476531982, "num_tokens": 895149861.0, "step": 23465 }, { "epoch": 2.9851163974049104, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.186859130859375, "learning_rate": 1e-06, "loss": 0.6091, "mean_token_accuracy": 0.870468020439148, "num_tokens": 895192958.0, "step": 23466 }, { "epoch": 2.985243607683501, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.533935546875, "learning_rate": 1e-06, "loss": 0.6041, "mean_token_accuracy": 0.8747047185897827, "num_tokens": 895226710.0, "step": 23467 }, { "epoch": 2.9853708179620915, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.15994644165039, "learning_rate": 1e-06, "loss": 0.6197, "mean_token_accuracy": 0.8689553737640381, "num_tokens": 895267695.0, "step": 23468 }, { "epoch": 2.985498028240682, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.366180419921875, "learning_rate": 1e-06, "loss": 0.5905, "mean_token_accuracy": 0.8794442415237427, "num_tokens": 895303101.0, "step": 23469 }, { "epoch": 2.9856252385192725, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.989864349365234, "learning_rate": 1e-06, "loss": 0.6099, "mean_token_accuracy": 0.8703592419624329, "num_tokens": 895340437.0, "step": 23470 }, { "epoch": 2.9857524487978626, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.42869186401367, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8795795440673828, "num_tokens": 895377695.0, "step": 23471 }, { "epoch": 2.9858796590764536, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.17301940917969, "learning_rate": 1e-06, "loss": 0.6621, "mean_token_accuracy": 0.8558164834976196, "num_tokens": 895418599.0, "step": 23472 }, { "epoch": 2.9860068693550437, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.158382415771484, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8654072880744934, "num_tokens": 895454832.0, "step": 23473 }, { "epoch": 2.9861340796336346, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.224281311035156, "learning_rate": 1e-06, "loss": 0.6983, "mean_token_accuracy": 0.8459621071815491, "num_tokens": 895490607.0, "step": 23474 }, { "epoch": 2.9862612899122247, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 52.82769775390625, "learning_rate": 1e-06, "loss": 0.6434, "mean_token_accuracy": 0.8584886193275452, "num_tokens": 895531340.0, "step": 23475 }, { "epoch": 2.9863885001908153, "ewc_loss": 0.236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002117156982421875, "grad_norm": 53.39097213745117, "learning_rate": 1e-06, "loss": 0.638, "mean_token_accuracy": 0.8637009859085083, "num_tokens": 895568688.0, "step": 23476 }, { "epoch": 2.986515710469406, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 52.775718688964844, "learning_rate": 1e-06, "loss": 0.6637, "mean_token_accuracy": 0.8533228635787964, "num_tokens": 895607316.0, "step": 23477 }, { "epoch": 2.9866429207479963, "ewc_loss": 0.23828125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000213623046875, "grad_norm": 53.469390869140625, "learning_rate": 1e-06, "loss": 0.5827, "mean_token_accuracy": 0.882526159286499, "num_tokens": 895648694.0, "step": 23478 }, { "epoch": 2.986770131026587, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.21412658691406, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8597636818885803, "num_tokens": 895687996.0, "step": 23479 }, { "epoch": 2.9868973413051774, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.3161735534668, "learning_rate": 1e-06, "loss": 0.6185, "mean_token_accuracy": 0.8676188588142395, "num_tokens": 895726727.0, "step": 23480 }, { "epoch": 2.987024551583768, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.35174560546875, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8691838979721069, "num_tokens": 895760688.0, "step": 23481 }, { "epoch": 2.9871517618623584, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 52.96730422973633, "learning_rate": 1e-06, "loss": 0.6934, "mean_token_accuracy": 0.8451695442199707, "num_tokens": 895797303.0, "step": 23482 }, { "epoch": 2.987278972140949, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.46524429321289, "learning_rate": 1e-06, "loss": 0.6148, "mean_token_accuracy": 0.8701062798500061, "num_tokens": 895834078.0, "step": 23483 }, { "epoch": 2.9874061824195395, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 52.90487289428711, "learning_rate": 1e-06, "loss": 0.6695, "mean_token_accuracy": 0.8537611961364746, "num_tokens": 895872640.0, "step": 23484 }, { "epoch": 2.98753339269813, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.680057525634766, "learning_rate": 1e-06, "loss": 0.5935, "mean_token_accuracy": 0.8790744543075562, "num_tokens": 895907646.0, "step": 23485 }, { "epoch": 2.9876606029767205, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.28840255737305, "learning_rate": 1e-06, "loss": 0.5993, "mean_token_accuracy": 0.8733638525009155, "num_tokens": 895944183.0, "step": 23486 }, { "epoch": 2.987787813255311, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.31168746948242, "learning_rate": 1e-06, "loss": 0.624, "mean_token_accuracy": 0.8660542964935303, "num_tokens": 895983987.0, "step": 23487 }, { "epoch": 2.9879150235339016, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.32555389404297, "learning_rate": 1e-06, "loss": 0.6644, "mean_token_accuracy": 0.8523473739624023, "num_tokens": 896020928.0, "step": 23488 }, { "epoch": 2.988042233812492, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.3226203918457, "learning_rate": 1e-06, "loss": 0.6028, "mean_token_accuracy": 0.8720228672027588, "num_tokens": 896057459.0, "step": 23489 }, { "epoch": 2.9881694440910826, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.57924270629883, "learning_rate": 1e-06, "loss": 0.6412, "mean_token_accuracy": 0.8611128926277161, "num_tokens": 896095049.0, "step": 23490 }, { "epoch": 2.988296654369673, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.051456451416016, "learning_rate": 1e-06, "loss": 0.5977, "mean_token_accuracy": 0.8759332895278931, "num_tokens": 896127433.0, "step": 23491 }, { "epoch": 2.9884238646482637, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.750511169433594, "learning_rate": 1e-06, "loss": 0.6178, "mean_token_accuracy": 0.8703726530075073, "num_tokens": 896165662.0, "step": 23492 }, { "epoch": 2.9885510749268542, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.116512298583984, "learning_rate": 1e-06, "loss": 0.639, "mean_token_accuracy": 0.8604886531829834, "num_tokens": 896207753.0, "step": 23493 }, { "epoch": 2.9886782852054448, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.64756774902344, "learning_rate": 1e-06, "loss": 0.636, "mean_token_accuracy": 0.862472653388977, "num_tokens": 896249105.0, "step": 23494 }, { "epoch": 2.9888054954840353, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.56645584106445, "learning_rate": 1e-06, "loss": 0.6302, "mean_token_accuracy": 0.863757848739624, "num_tokens": 896291728.0, "step": 23495 }, { "epoch": 2.9889327057626254, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.59689712524414, "learning_rate": 1e-06, "loss": 0.6633, "mean_token_accuracy": 0.8529320359230042, "num_tokens": 896331729.0, "step": 23496 }, { "epoch": 2.9890599160412163, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.67683029174805, "learning_rate": 1e-06, "loss": 0.6633, "mean_token_accuracy": 0.8542015552520752, "num_tokens": 896371804.0, "step": 23497 }, { "epoch": 2.9891871263198064, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.11980056762695, "learning_rate": 1e-06, "loss": 0.6319, "mean_token_accuracy": 0.8609657883644104, "num_tokens": 896408398.0, "step": 23498 }, { "epoch": 2.9893143365983974, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.8101921081543, "learning_rate": 1e-06, "loss": 0.5973, "mean_token_accuracy": 0.8796589374542236, "num_tokens": 896440496.0, "step": 23499 }, { "epoch": 2.9894415468769875, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.62727737426758, "learning_rate": 1e-06, "loss": 0.6294, "mean_token_accuracy": 0.8667498826980591, "num_tokens": 896475212.0, "step": 23500 }, { "epoch": 2.989568757155578, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.3252067565918, "learning_rate": 1e-06, "loss": 0.6429, "mean_token_accuracy": 0.8600558042526245, "num_tokens": 896517122.0, "step": 23501 }, { "epoch": 2.9896959674341685, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.49821853637695, "learning_rate": 1e-06, "loss": 0.6105, "mean_token_accuracy": 0.8714171648025513, "num_tokens": 896557259.0, "step": 23502 }, { "epoch": 2.989823177712759, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.3990478515625, "learning_rate": 1e-06, "loss": 0.6583, "mean_token_accuracy": 0.8554954528808594, "num_tokens": 896600237.0, "step": 23503 }, { "epoch": 2.9899503879913496, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.78671646118164, "learning_rate": 1e-06, "loss": 0.5925, "mean_token_accuracy": 0.8764909505844116, "num_tokens": 896640564.0, "step": 23504 }, { "epoch": 2.99007759826994, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.26145935058594, "learning_rate": 1e-06, "loss": 0.5866, "mean_token_accuracy": 0.8770617842674255, "num_tokens": 896674164.0, "step": 23505 }, { "epoch": 2.9902048085485307, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 53.83760452270508, "learning_rate": 1e-06, "loss": 0.66, "mean_token_accuracy": 0.8595339059829712, "num_tokens": 896716613.0, "step": 23506 }, { "epoch": 2.990332018827121, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.55967712402344, "learning_rate": 1e-06, "loss": 0.6628, "mean_token_accuracy": 0.8550406694412231, "num_tokens": 896755586.0, "step": 23507 }, { "epoch": 2.9904592291057117, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 53.82326889038086, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.862552285194397, "num_tokens": 896797641.0, "step": 23508 }, { "epoch": 2.9905864393843022, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.59376907348633, "learning_rate": 1e-06, "loss": 0.6411, "mean_token_accuracy": 0.8613801002502441, "num_tokens": 896835113.0, "step": 23509 }, { "epoch": 2.9907136496628928, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.15837860107422, "learning_rate": 1e-06, "loss": 0.6221, "mean_token_accuracy": 0.8640093803405762, "num_tokens": 896866496.0, "step": 23510 }, { "epoch": 2.9908408599414833, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.30704116821289, "learning_rate": 1e-06, "loss": 0.6428, "mean_token_accuracy": 0.8616772294044495, "num_tokens": 896907535.0, "step": 23511 }, { "epoch": 2.990968070220074, "ewc_loss": 0.2294921875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020503997802734375, "grad_norm": 53.153839111328125, "learning_rate": 1e-06, "loss": 0.5712, "mean_token_accuracy": 0.8811507821083069, "num_tokens": 896948574.0, "step": 23512 }, { "epoch": 2.9910952804986644, "ewc_loss": 0.236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002117156982421875, "grad_norm": 54.75286102294922, "learning_rate": 1e-06, "loss": 0.6585, "mean_token_accuracy": 0.8578649759292603, "num_tokens": 896992377.0, "step": 23513 }, { "epoch": 2.991222490777255, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 52.493385314941406, "learning_rate": 1e-06, "loss": 0.6386, "mean_token_accuracy": 0.8604800701141357, "num_tokens": 897035411.0, "step": 23514 }, { "epoch": 2.9913497010558454, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 55.442344665527344, "learning_rate": 1e-06, "loss": 0.6388, "mean_token_accuracy": 0.8663653135299683, "num_tokens": 897074661.0, "step": 23515 }, { "epoch": 2.991476911334436, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.82942581176758, "learning_rate": 1e-06, "loss": 0.6201, "mean_token_accuracy": 0.8659524917602539, "num_tokens": 897115550.0, "step": 23516 }, { "epoch": 2.9916041216130265, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 54.55860137939453, "learning_rate": 1e-06, "loss": 0.6542, "mean_token_accuracy": 0.8577237725257874, "num_tokens": 897155091.0, "step": 23517 }, { "epoch": 2.991731331891617, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 54.1722526550293, "learning_rate": 1e-06, "loss": 0.6373, "mean_token_accuracy": 0.8607540130615234, "num_tokens": 897196551.0, "step": 23518 }, { "epoch": 2.991858542170207, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.4930305480957, "learning_rate": 1e-06, "loss": 0.5931, "mean_token_accuracy": 0.8741892576217651, "num_tokens": 897231885.0, "step": 23519 }, { "epoch": 2.991985752448798, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.30405807495117, "learning_rate": 1e-06, "loss": 0.631, "mean_token_accuracy": 0.8625374436378479, "num_tokens": 897267944.0, "step": 23520 }, { "epoch": 2.992112962727388, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.56902313232422, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8677301406860352, "num_tokens": 897307330.0, "step": 23521 }, { "epoch": 2.992240173005979, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.9018440246582, "learning_rate": 1e-06, "loss": 0.6277, "mean_token_accuracy": 0.8654305338859558, "num_tokens": 897344028.0, "step": 23522 }, { "epoch": 2.992367383284569, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.022403717041016, "learning_rate": 1e-06, "loss": 0.6545, "mean_token_accuracy": 0.8560912609100342, "num_tokens": 897391213.0, "step": 23523 }, { "epoch": 2.99249459356316, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.50101852416992, "learning_rate": 1e-06, "loss": 0.651, "mean_token_accuracy": 0.857717752456665, "num_tokens": 897428222.0, "step": 23524 }, { "epoch": 2.9926218038417502, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.884788513183594, "learning_rate": 1e-06, "loss": 0.6825, "mean_token_accuracy": 0.8513707518577576, "num_tokens": 897457737.0, "step": 23525 }, { "epoch": 2.992749014120341, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.53562545776367, "learning_rate": 1e-06, "loss": 0.6002, "mean_token_accuracy": 0.8732044696807861, "num_tokens": 897498925.0, "step": 23526 }, { "epoch": 2.9928762243989313, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.5106086730957, "learning_rate": 1e-06, "loss": 0.6504, "mean_token_accuracy": 0.8596549034118652, "num_tokens": 897533157.0, "step": 23527 }, { "epoch": 2.993003434677522, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.66695022583008, "learning_rate": 1e-06, "loss": 0.6216, "mean_token_accuracy": 0.8656868934631348, "num_tokens": 897574159.0, "step": 23528 }, { "epoch": 2.9931306449561124, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 54.210445404052734, "learning_rate": 1e-06, "loss": 0.6531, "mean_token_accuracy": 0.8548873662948608, "num_tokens": 897606970.0, "step": 23529 }, { "epoch": 2.993257855234703, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 54.29631805419922, "learning_rate": 1e-06, "loss": 0.7075, "mean_token_accuracy": 0.8399168848991394, "num_tokens": 897647725.0, "step": 23530 }, { "epoch": 2.9933850655132934, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.48487091064453, "learning_rate": 1e-06, "loss": 0.6208, "mean_token_accuracy": 0.8661933541297913, "num_tokens": 897687109.0, "step": 23531 }, { "epoch": 2.993512275791884, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 54.60457229614258, "learning_rate": 1e-06, "loss": 0.6048, "mean_token_accuracy": 0.8751281499862671, "num_tokens": 897720994.0, "step": 23532 }, { "epoch": 2.9936394860704745, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.63091278076172, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8800885081291199, "num_tokens": 897749855.0, "step": 23533 }, { "epoch": 2.993766696349065, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 54.505859375, "learning_rate": 1e-06, "loss": 0.6255, "mean_token_accuracy": 0.8603014945983887, "num_tokens": 897792846.0, "step": 23534 }, { "epoch": 2.9938939066276555, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.3051643371582, "learning_rate": 1e-06, "loss": 0.6173, "mean_token_accuracy": 0.8650797605514526, "num_tokens": 897836765.0, "step": 23535 }, { "epoch": 2.994021116906246, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 55.07276153564453, "learning_rate": 1e-06, "loss": 0.629, "mean_token_accuracy": 0.8649398684501648, "num_tokens": 897880918.0, "step": 23536 }, { "epoch": 2.9941483271848366, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 52.678592681884766, "learning_rate": 1e-06, "loss": 0.6226, "mean_token_accuracy": 0.860386073589325, "num_tokens": 897916676.0, "step": 23537 }, { "epoch": 2.994275537463427, "ewc_loss": 0.236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002117156982421875, "grad_norm": 55.184967041015625, "learning_rate": 1e-06, "loss": 0.6139, "mean_token_accuracy": 0.8709740042686462, "num_tokens": 897953968.0, "step": 23538 }, { "epoch": 2.9944027477420176, "ewc_loss": 0.220703125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001964569091796875, "grad_norm": 53.24406814575195, "learning_rate": 1e-06, "loss": 0.6364, "mean_token_accuracy": 0.8644479513168335, "num_tokens": 897993055.0, "step": 23539 }, { "epoch": 2.994529958020608, "ewc_loss": 0.2373046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021266937255859375, "grad_norm": 55.28359603881836, "learning_rate": 1e-06, "loss": 0.6018, "mean_token_accuracy": 0.8739912509918213, "num_tokens": 898030202.0, "step": 23540 }, { "epoch": 2.9946571682991987, "ewc_loss": 0.2216796875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019741058349609375, "grad_norm": 53.41278839111328, "learning_rate": 1e-06, "loss": 0.627, "mean_token_accuracy": 0.8596339225769043, "num_tokens": 898067234.0, "step": 23541 }, { "epoch": 2.9947843785777892, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.165523529052734, "learning_rate": 1e-06, "loss": 0.6134, "mean_token_accuracy": 0.8683511018753052, "num_tokens": 898105423.0, "step": 23542 }, { "epoch": 2.9949115888563798, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 53.81056594848633, "learning_rate": 1e-06, "loss": 0.5575, "mean_token_accuracy": 0.8838653564453125, "num_tokens": 898144550.0, "step": 23543 }, { "epoch": 2.99503879913497, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.41283416748047, "learning_rate": 1e-06, "loss": 0.6252, "mean_token_accuracy": 0.8670424222946167, "num_tokens": 898179277.0, "step": 23544 }, { "epoch": 2.995166009413561, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.2514762878418, "learning_rate": 1e-06, "loss": 0.6218, "mean_token_accuracy": 0.8633456230163574, "num_tokens": 898218518.0, "step": 23545 }, { "epoch": 2.995293219692151, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.992774963378906, "learning_rate": 1e-06, "loss": 0.6594, "mean_token_accuracy": 0.8514766693115234, "num_tokens": 898259266.0, "step": 23546 }, { "epoch": 2.995420429970742, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.291160583496094, "learning_rate": 1e-06, "loss": 0.6623, "mean_token_accuracy": 0.8620957136154175, "num_tokens": 898292648.0, "step": 23547 }, { "epoch": 2.995547640249332, "ewc_loss": 0.2255859375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020122528076171875, "grad_norm": 53.20185089111328, "learning_rate": 1e-06, "loss": 0.6209, "mean_token_accuracy": 0.8669838905334473, "num_tokens": 898328413.0, "step": 23548 }, { "epoch": 2.995674850527923, "ewc_loss": 0.234375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000209808349609375, "grad_norm": 54.308677673339844, "learning_rate": 1e-06, "loss": 0.6313, "mean_token_accuracy": 0.8662581443786621, "num_tokens": 898376963.0, "step": 23549 }, { "epoch": 2.995802060806513, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.849266052246094, "learning_rate": 1e-06, "loss": 0.5848, "mean_token_accuracy": 0.8763657808303833, "num_tokens": 898407256.0, "step": 23550 }, { "epoch": 2.9959292710851035, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.83777618408203, "learning_rate": 1e-06, "loss": 0.6323, "mean_token_accuracy": 0.8673108816146851, "num_tokens": 898448800.0, "step": 23551 }, { "epoch": 2.996056481363694, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.72886657714844, "learning_rate": 1e-06, "loss": 0.6456, "mean_token_accuracy": 0.8602548837661743, "num_tokens": 898488912.0, "step": 23552 }, { "epoch": 2.9961836916422846, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.64997100830078, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.85883629322052, "num_tokens": 898530434.0, "step": 23553 }, { "epoch": 2.996310901920875, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.68373107910156, "learning_rate": 1e-06, "loss": 0.6251, "mean_token_accuracy": 0.8646999001502991, "num_tokens": 898565044.0, "step": 23554 }, { "epoch": 2.9964381121994657, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.783241271972656, "learning_rate": 1e-06, "loss": 0.6027, "mean_token_accuracy": 0.8759981989860535, "num_tokens": 898601798.0, "step": 23555 }, { "epoch": 2.996565322478056, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.778160095214844, "learning_rate": 1e-06, "loss": 0.6569, "mean_token_accuracy": 0.854447066783905, "num_tokens": 898644789.0, "step": 23556 }, { "epoch": 2.9966925327566467, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.1411018371582, "learning_rate": 1e-06, "loss": 0.6327, "mean_token_accuracy": 0.8638708591461182, "num_tokens": 898683928.0, "step": 23557 }, { "epoch": 2.9968197430352372, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.804019927978516, "learning_rate": 1e-06, "loss": 0.6421, "mean_token_accuracy": 0.8594751358032227, "num_tokens": 898726217.0, "step": 23558 }, { "epoch": 2.9969469533138278, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.75543975830078, "learning_rate": 1e-06, "loss": 0.5962, "mean_token_accuracy": 0.8731398582458496, "num_tokens": 898760575.0, "step": 23559 }, { "epoch": 2.9970741635924183, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.60396194458008, "learning_rate": 1e-06, "loss": 0.6341, "mean_token_accuracy": 0.8615258932113647, "num_tokens": 898801232.0, "step": 23560 }, { "epoch": 2.997201373871009, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 54.089317321777344, "learning_rate": 1e-06, "loss": 0.601, "mean_token_accuracy": 0.8718326091766357, "num_tokens": 898837174.0, "step": 23561 }, { "epoch": 2.9973285841495994, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.89447784423828, "learning_rate": 1e-06, "loss": 0.6681, "mean_token_accuracy": 0.8543484210968018, "num_tokens": 898875849.0, "step": 23562 }, { "epoch": 2.99745579442819, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 53.554134368896484, "learning_rate": 1e-06, "loss": 0.6074, "mean_token_accuracy": 0.8730393648147583, "num_tokens": 898909975.0, "step": 23563 }, { "epoch": 2.9975830047067804, "ewc_loss": 0.232421875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002079010009765625, "grad_norm": 54.19563293457031, "learning_rate": 1e-06, "loss": 0.6527, "mean_token_accuracy": 0.8562359809875488, "num_tokens": 898948250.0, "step": 23564 }, { "epoch": 2.997710214985371, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.977622985839844, "learning_rate": 1e-06, "loss": 0.5854, "mean_token_accuracy": 0.8802510499954224, "num_tokens": 898989122.0, "step": 23565 }, { "epoch": 2.9978374252639615, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.08015441894531, "learning_rate": 1e-06, "loss": 0.6447, "mean_token_accuracy": 0.86027592420578, "num_tokens": 899028715.0, "step": 23566 }, { "epoch": 2.997964635542552, "ewc_loss": 0.2265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.000202178955078125, "grad_norm": 53.09165954589844, "learning_rate": 1e-06, "loss": 0.6096, "mean_token_accuracy": 0.8675084114074707, "num_tokens": 899064101.0, "step": 23567 }, { "epoch": 2.9980918458211425, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.491455078125, "learning_rate": 1e-06, "loss": 0.6289, "mean_token_accuracy": 0.867958664894104, "num_tokens": 899101759.0, "step": 23568 }, { "epoch": 2.9982190560997326, "ewc_loss": 0.2275390625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020313262939453125, "grad_norm": 52.994956970214844, "learning_rate": 1e-06, "loss": 0.6471, "mean_token_accuracy": 0.8573267459869385, "num_tokens": 899142273.0, "step": 23569 }, { "epoch": 2.9983462663783236, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.455753326416016, "learning_rate": 1e-06, "loss": 0.6276, "mean_token_accuracy": 0.8653054237365723, "num_tokens": 899175878.0, "step": 23570 }, { "epoch": 2.9984734766569137, "ewc_loss": 0.22265625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001983642578125, "grad_norm": 53.491275787353516, "learning_rate": 1e-06, "loss": 0.574, "mean_token_accuracy": 0.8758161664009094, "num_tokens": 899208937.0, "step": 23571 }, { "epoch": 2.9986006869355046, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.08572006225586, "learning_rate": 1e-06, "loss": 0.6344, "mean_token_accuracy": 0.8642632961273193, "num_tokens": 899244088.0, "step": 23572 }, { "epoch": 2.9987278972140947, "ewc_loss": 0.228515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002040863037109375, "grad_norm": 53.60318374633789, "learning_rate": 1e-06, "loss": 0.6497, "mean_token_accuracy": 0.8600296974182129, "num_tokens": 899280111.0, "step": 23573 }, { "epoch": 2.9988551074926852, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 53.95406723022461, "learning_rate": 1e-06, "loss": 0.6365, "mean_token_accuracy": 0.8639534711837769, "num_tokens": 899310882.0, "step": 23574 }, { "epoch": 2.9989823177712758, "ewc_loss": 0.2314453125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020694732666015625, "grad_norm": 53.799800872802734, "learning_rate": 1e-06, "loss": 0.6858, "mean_token_accuracy": 0.8515738248825073, "num_tokens": 899346820.0, "step": 23575 }, { "epoch": 2.9991095280498663, "ewc_loss": 0.2333984375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020885467529296875, "grad_norm": 54.20637512207031, "learning_rate": 1e-06, "loss": 0.5844, "mean_token_accuracy": 0.8789898157119751, "num_tokens": 899382044.0, "step": 23576 }, { "epoch": 2.999236738328457, "ewc_loss": 0.23046875, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00020599365234375, "grad_norm": 53.252098083496094, "learning_rate": 1e-06, "loss": 0.6156, "mean_token_accuracy": 0.865043580532074, "num_tokens": 899418389.0, "step": 23577 }, { "epoch": 2.9993639486070474, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 55.04743194580078, "learning_rate": 1e-06, "loss": 0.6478, "mean_token_accuracy": 0.8615809082984924, "num_tokens": 899458486.0, "step": 23578 }, { "epoch": 2.999491158885638, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.013641357421875, "learning_rate": 1e-06, "loss": 0.6303, "mean_token_accuracy": 0.8629188537597656, "num_tokens": 899498168.0, "step": 23579 }, { "epoch": 2.9996183691642284, "ewc_loss": 0.236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002117156982421875, "grad_norm": 54.66898727416992, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8795205354690552, "num_tokens": 899538655.0, "step": 23580 }, { "epoch": 2.999745579442819, "ewc_loss": 0.2236328125, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00019931793212890625, "grad_norm": 53.23987579345703, "learning_rate": 1e-06, "loss": 0.6578, "mean_token_accuracy": 0.8524233102798462, "num_tokens": 899581693.0, "step": 23581 }, { "epoch": 2.9998727897214095, "ewc_loss": 0.2353515625, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00021076202392578125, "grad_norm": 54.89610290527344, "learning_rate": 1e-06, "loss": 0.6462, "mean_token_accuracy": 0.862053632736206, "num_tokens": 899623817.0, "step": 23582 }, { "epoch": 3.0, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "grad_norm": 53.4432258605957, "learning_rate": 1e-06, "loss": 0.616, "mean_token_accuracy": 0.8680695295333862, "num_tokens": 899664226.0, "step": 23583 }, { "epoch": 3.0, "ewc_loss": 0.224609375, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0002002716064453125, "step": 23583, "total_flos": 5.62815163329864e+19, "train_loss": 0.5611200594813073, "train_runtime": 47572.7611, "train_samples_per_second": 7.931, "train_steps_per_second": 0.496 } ], "logging_steps": 1, "max_steps": 23583, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 11792, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.62815163329864e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }